import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import entropy
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — not portable; consider a
# relative path or a configurable constant.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
# Display the dataframe (notebook cell output follows)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Build the combined dataframe for the 13 population-based country pairings.
# NOTE(review): contrary to the original comment, the concatenation includes
# BOTH countries of every pair (26 countries), not only the first of each pair.
# The original list order is preserved exactly so the exported CSV's row order
# is unchanged.
paired_countries = [
    "Austria", "Belgium", "Bulgaria", "Cyprus", "Czechia", "Denmark",
    "Estonia", "Finland", "France", "Netherlands", "Portugal", "Slovakia",
    "United Kingdom", "Switzerland", "Canada", "Serbia", "Luxembourg",
    "Romania", "Ireland", "Latvia", "Iceland", "Italy", "Sweden", "Spain",
    "Slovenia", "United States",
]
dataframes = []
for country in paired_countries:
    country_df = df[df.location == country]
    if country == "United Kingdom":
        # The original analysis drops the first two UK rows (tail(-2)) —
        # presumably to align its start date with the other series; TODO confirm.
        country_df = country_df.tail(-2)
    dataframes.append(country_df)
dataframe_one = pd.concat(dataframes)
# Exporting the combined dataframe to a CSV file.
# NOTE(review): the index is written too, so a later read_csv will pick it up
# as an extra unnamed column (it is dropped downstream by explicit column
# selection, so behavior is preserved here).
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): "dataframe-one.csv" was written with its index included, so this
# read_csv gains an extra unnamed index column; it is dropped implicitly by the
# explicit column selection later. Hard-coded absolute path — not portable.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries forming the current population-based pairing
country1 = 'Austria'
country2 = 'Switzerland'
# Restrict the data to the population-health-index features needed for the
# Random Forest analysis, keeping only the rows of the two paired countries.
health_index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                     'female_smokers', 'male_smokers', 'life_expectancy',
                     'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[
    df_firstCountryPairing['location'].isin([country1, country2]), health_index_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322149 |
2078 rows × 10 columns
'''
Lagged mortality variables (previous day / week / month) convert the OWID
COVID-19 time series into a supervised-learning table: each row becomes an
independent observation whose columns are features, which is the tabular form
a Random Forest (a non-sequential ensemble method) requires. This lets the
model be used directly to assess which variables best predict COVID-19
mortality (death rate) per country.
'''
# Build each lag within each country's own series (shift never crosses country
# boundaries), then fill the rows that have no history yet with 0 — the series
# also starts at 0 mortality, so this matches the original treatment.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_firstCountryPairing[lag_col] = (
        df_firstCountryPairing.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health features.
# FIX(review): the original call was pca.fit(df_firstCountryPairing.iloc[:, 2:]),
# which included the 'Mortality Rate' TARGET column in the PCA input — target
# leakage that lets the components encode the answer and inflates the downstream
# model scores. The target is excluded here; the lagged mortality columns are
# kept because they only use past information.
pca_input_cols = [c for c in df_firstCountryPairing.columns[2:] if c != 'Mortality Rate']
pca = PCA()
pca.fit(df_firstCountryPairing[pca_input_cols])
# Keep 7 principal components — the number of population-health input variables
n_components = 7
principal_components = pca.transform(df_firstCountryPairing[pca_input_cols])[:, :n_components]
# FIX(review): each principal component is a linear mixture of ALL inputs, so
# the original labels (reusing raw feature names) were misleading; use PC1..PC7.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_firstCountryPairing['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# below — only 'Mortality Rate' is read from this frame afterwards.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set
# (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only and
# the same transformation is then applied to both splits, so no test-set
# statistics leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random forest regressor with a fixed seed for reproducibility; its
# hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid to search over (3*3*3*3 = 81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 fans the candidate fits out over all CPU cores — results are
# identical, only faster.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Best hyperparameters and the corresponding mean cross-validated score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989966036956606
# Re-fit a random forest using the best hyperparameters found by the grid
# search (best_params_ holds exactly the four tuned parameters)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002337643207376714 R2 Score: 0.9987160137717644 RMSE: 0.048349 Entropy Value: 0.0003719633457117023
# Rank the model inputs (the columns of X) by random-forest importance,
# highest first. NOTE(review): X holds PCA outputs, so these importances
# describe the principal-component columns, not raw epidemiological variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.943736 |
| 6 | median_age | 0.039143 |
| 2 | female_smokers | 0.012458 |
| 3 | male_smokers | 0.001942 |
| 0 | cardiovasc_death_rate | 0.001906 |
| 5 | aged_65_older | 0.000619 |
| 4 | life_expectancy | 0.000196 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): re-reads the same "dataframe-one.csv" as before; the file was
# written with its index, so an extra unnamed index column appears and is
# dropped implicitly by the later explicit column selection. Hard-coded path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries forming the current population-based pairing
country1 = 'Austria'
country2 = 'Switzerland'
# Restrict the data to the country-health-index features needed for the
# Random Forest analysis, keeping only the rows of the two paired countries.
country_index_cols = ['location', 'date', 'hospital_beds_per_thousand',
                      'human_development_index', 'extreme_poverty',
                      'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]), country_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 106.749 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 0.322149 |
2078 rows × 8 columns
'''
Lagged mortality variables (previous day / week / month) convert the OWID
COVID-19 time series into a supervised-learning table: each row becomes an
independent observation whose columns are features, which is the tabular form
a Random Forest (a non-sequential ensemble method) requires. This lets the
model be used directly to assess which variables best predict COVID-19
mortality (death rate) per country.
'''
# Build each lag within each country's own series (shift never crosses country
# boundaries), then fill the rows that have no history yet with 0 — the series
# also starts at 0 mortality, so this matches the original treatment.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-health features.
# FIX(review): the original call was pca.fit(df_updated.iloc[:, 2:]), which
# included the 'Mortality Rate' TARGET column in the PCA input — target leakage
# that inflates the downstream model scores. The target is excluded here; the
# lagged mortality columns are kept because they only use past information.
pca_input_cols = [c for c in df_updated.columns[2:] if c != 'Mortality Rate']
pca = PCA()
pca.fit(df_updated[pca_input_cols])
# Keep 5 principal components — the number of country-health input variables
n_components = 5
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]
# FIX(review): each principal component is a linear mixture of ALL inputs, so
# the original labels (reusing raw feature names) were misleading; use PC1..PC5.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# below — only 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set
# (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only and
# the same transformation is then applied to both splits, so no test-set
# statistics leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random forest regressor with a fixed seed for reproducibility; its
# hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid to search over (3*3*3*3 = 81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 fans the candidate fits out over all CPU cores — results are
# identical, only faster.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Best hyperparameters and the corresponding mean cross-validated score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989300549094547
# Re-fit a random forest using the best hyperparameters found by the grid
# search (best_params_ holds exactly the four tuned parameters)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015429193968664893 R2 Score: 0.9991525279603822 RMSE: 0.039280 Entropy Value: 0.0003132008576474396
# Rank the model inputs (the columns of X) by random-forest importance,
# highest first. NOTE(review): X holds PCA outputs, so these importances
# describe the principal-component columns, not raw socioeconomic variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.974799 |
| 2 | extreme_poverty | 0.022650 |
| 3 | gdp_per_capita | 0.001858 |
| 0 | hospital_beds_per_thousand | 0.000376 |
| 4 | population_density | 0.000317 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): re-reads the same "dataframe-one.csv" as before; the file was
# written with its index, so an extra unnamed index column appears and is
# dropped implicitly by the later explicit column selection. Hard-coded path.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries forming the current population-based pairing
country1 = 'Belgium'
country2 = 'Canada'
# Restrict the data to the population-health-index features needed for the
# Random Forest analysis, keeping only the rows of the two paired countries.
health_index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                     'female_smokers', 'male_smokers', 'life_expectancy',
                     'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[
    df_firstCountryPairing['location'].isin([country1, country2]), health_index_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2132 rows × 10 columns
'''
Lagged mortality variables (previous day / week / month) convert the OWID
COVID-19 time series into a supervised-learning table: each row becomes an
independent observation whose columns are features, which is the tabular form
a Random Forest (a non-sequential ensemble method) requires. This lets the
model be used directly to assess which variables best predict COVID-19
mortality (death rate) per country.
'''
# Build each lag within each country's own series (shift never crosses country
# boundaries), then fill the rows that have no history yet with 0 — the series
# also starts at 0 mortality, so this matches the original treatment.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_firstCountryPairing[lag_col] = (
        df_firstCountryPairing.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health features.
# FIX(review): the original call was pca.fit(df_firstCountryPairing.iloc[:, 2:]),
# which included the 'Mortality Rate' TARGET column in the PCA input — target
# leakage that lets the components encode the answer and inflates the downstream
# model scores. The target is excluded here; the lagged mortality columns are
# kept because they only use past information.
pca_input_cols = [c for c in df_firstCountryPairing.columns[2:] if c != 'Mortality Rate']
pca = PCA()
pca.fit(df_firstCountryPairing[pca_input_cols])
# Keep 7 principal components — the number of population-health input variables
n_components = 7
principal_components = pca.transform(df_firstCountryPairing[pca_input_cols])[:, :n_components]
# FIX(review): each principal component is a linear mixture of ALL inputs, so
# the original labels (reusing raw feature names) were misleading; use PC1..PC7.
selected_cols = ['PC%d' % (i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_firstCountryPairing['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# below — only 'Mortality Rate' is read from this frame afterwards.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into a 70% training set and a 30% testing set
# (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only and
# the same transformation is then applied to both splits, so no test-set
# statistics leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9969839146584258
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.041054134347689675 R2 Score: 0.996895600824653 RMSE: 0.202618 Entropy Value: 0.0008300099316689873
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.902614 |
| 0 | cardiovasc_death_rate | 0.058815 |
| 2 | female_smokers | 0.024297 |
| 5 | aged_65_older | 0.006879 |
| 6 | median_age | 0.005122 |
| 3 | male_smokers | 0.001783 |
| 4 | life_expectancy | 0.000490 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config constant
# so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run
country1 = 'Belgium'
country2 = 'Canada'
# Restrict to the country-health-index predictors (plus identifiers and the target)
# and to the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 1.093162 |
2132 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country, then
# zero-fill the NaNs introduced at each country's start of series by the shift.
for _lag_name, _lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_updated[_lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(_lag_days)
    df_updated[_lag_name] = df_updated[_lag_name].fillna(0)

# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original code fit PCA on df.iloc[:, 2:], a slice that still contains the
# target column ('Mortality Rate'); deriving model inputs from a decomposition that
# includes the target leaks the answer into the features. PCA is now fit on the
# predictor columns only.
pca_input_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[pca_input_cols])

# Keep the first 5 principal components — one per country-health-index input variable
# used by the Random Forest model below.
n_components = 5
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]

# NOTE: these columns hold principal components, not the raw variables; they reuse the
# original feature names only so the downstream feature-importance table stays readable.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values

# One-hot encode 'location' (kept for parity with the rest of the notebook; the dummy
# columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])

selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training (70%) and testing (30%) sets for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit scaling on the training set only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validated grid search over the Random Forest grid
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the ORIGINAL (leaky) run — expect different numbers after the fix:
#   Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9974349402891136

# Refit a Random Forest with the best hyperparameters found above
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate MSE, RMSE, R^2, and entropy on the held-out test set.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalised as probability distributions — not a standard regression metric;
# interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the ORIGINAL (leaky) run:
#   MSE: 0.03575065581898076  R2 Score: 0.9972966350842372  RMSE: 0.189078  Entropy Value: 0.0008915270369744506

# Rank the model inputs by Random Forest feature importance (descending)
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.944229 |
| 0 | hospital_beds_per_thousand | 0.030624 |
| 2 | extreme_poverty | 0.022911 |
| 3 | gdp_per_capita | 0.001623 |
| 4 | population_density | 0.000613 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config constant.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run
country1 = 'Bulgaria'
country2 = 'Serbia'
# Restrict to the population-health-index predictors (plus identifiers and the target)
# and to the two selected countries, in a single .loc selection.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[
    df_firstCountryPairing['location'].isin([country1, country2]), health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2065 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country, then
# zero-fill the NaNs introduced at each country's start of series by the shift.
for _lag_name, _lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_firstCountryPairing[_lag_name] = (
        df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(_lag_days))
    df_firstCountryPairing[_lag_name] = df_firstCountryPairing[_lag_name].fillna(0)

# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original code fit PCA on df.iloc[:, 2:], a slice that still contains the
# target column ('Mortality Rate'); deriving model inputs from a decomposition that
# includes the target leaks the answer into the features. PCA is now fit on the
# predictor columns only.
pca_input_cols = [c for c in df_firstCountryPairing.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_firstCountryPairing[pca_input_cols])

# Keep the first 7 principal components — one per population-health input variable
# used by the Random Forest model below.
n_components = 7
principal_components = pca.transform(df_firstCountryPairing[pca_input_cols])[:, :n_components]

# NOTE: these columns hold principal components, not the raw variables; they reuse the
# original feature names only so the downstream feature-importance table stays readable.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values

# One-hot encode 'location' (kept for parity with the rest of the notebook; the dummy
# columns are not used as model inputs below).
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])

selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values

# Split the dataset into training (70%) and testing (30%) sets for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit scaling on the training set only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validated grid search over the Random Forest grid
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the ORIGINAL (leaky) run — expect different numbers after the fix:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
#   Best CV score: 0.9583831913274017

# Refit a Random Forest with the best hyperparameters found above
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate MSE, RMSE, R^2, and entropy on the held-out test set.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalised as probability distributions — not a standard regression metric;
# interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the ORIGINAL (leaky) run:
#   MSE: 0.005568712809238791  R2 Score: 0.99727493711111  RMSE: 0.074624  Entropy Value: 0.000639391380225498

# Rank the model inputs by Random Forest feature importance (descending)
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.809015 |
| 5 | aged_65_older | 0.109960 |
| 2 | female_smokers | 0.032594 |
| 3 | male_smokers | 0.025727 |
| 1 | diabetes_prevalence | 0.016079 |
| 4 | life_expectancy | 0.005342 |
| 6 | median_age | 0.001282 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run
country1 = 'Bulgaria'
country2 = 'Serbia'
# Restrict to the country-health-index predictors (plus identifiers and the target)
# and to the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 0.716205 |
2065 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country, then
# zero-fill the NaNs introduced at each country's start of series by the shift.
for _lag_name, _lag_days in (('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)):
    df_updated[_lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(_lag_days)
    df_updated[_lag_name] = df_updated[_lag_name].fillna(0)

# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX: the original code fit PCA on df.iloc[:, 2:], a slice that still contains the
# target column ('Mortality Rate'); deriving model inputs from a decomposition that
# includes the target leaks the answer into the features. PCA is now fit on the
# predictor columns only.
pca_input_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[pca_input_cols])

# Keep the first 5 principal components — one per country-health-index input variable
# used by the Random Forest model below.
n_components = 5
principal_components = pca.transform(df_updated[pca_input_cols])[:, :n_components]

# NOTE: these columns hold principal components, not the raw variables; they reuse the
# original feature names only so the downstream feature-importance table stays readable.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values

# One-hot encode 'location' (kept for parity with the rest of the notebook; the dummy
# columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])

selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training (70%) and testing (30%) sets for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit scaling on the training set only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search: 10-fold cross-validated grid search over the Random Forest grid
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the ORIGINAL (leaky) run — expect different numbers after the fix:
#   Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9565151571739534

# Refit a Random Forest with the best hyperparameters found above
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate MSE, RMSE, R^2, and entropy on the held-out test set.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the two
# vectors renormalised as probability distributions — not a standard regression metric;
# interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the ORIGINAL (leaky) run:
#   MSE: 0.006760468873536101  R2 Score: 0.9966917484399258  RMSE: 0.082222  Entropy Value: 0.000440139151045257

# Rank the model inputs by Random Forest feature importance (descending)
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | hospital_beds_per_thousand | 0.764675 |
| 1 | human_development_index | 0.156947 |
| 2 | extreme_poverty | 0.037875 |
| 3 | gdp_per_capita | 0.026132 |
| 4 | population_density | 0.014372 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a config constant.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Luxembourg'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[country_mask, population_health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2068 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate so lags never cross country boundaries)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country's series have no history to shift from)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at
# this point includes 'Mortality Rate' (the prediction target) and its three
# lagged copies — the target leaks into the fitted components. Confirm whether
# only the predictor columns were intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset before
# the train/test split, so test rows influence the components.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these seven columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — reusing the original feature
# names makes the later feature-importance table read as if it ranked the raw
# variables. Names like 'PC1'..'PC7' would be truthful.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model
# (X below is built from principal_df only) — confirm this step is still needed.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set is
# transformed with training-set statistics only, avoiding leakage at this step)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor (its hyperparameters are tuned
# below; random_state fixes the bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; it does not
# change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation R^2 score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9919358488759082
# Reuse the best model found by the grid search above.
# NOTE: GridSearchCV refits the best parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is the same model
# that re-instantiating RandomForestRegressor with best_params_ (random_state=42
# is carried over from the base estimator) and calling fit again would produce —
# without the redundant second fit.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and zero entries in y_test make it ill-defined;
# confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0010130224723005955 R2 Score: 0.997337568618319 RMSE: 0.031828 Entropy Value: 0.00045829706608428716
# Rank the model inputs by the importance scores of the fitted Random Forest.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.795629 |
| 0 | cardiovasc_death_rate | 0.146630 |
| 2 | female_smokers | 0.035997 |
| 5 | aged_65_older | 0.010106 |
| 3 | male_smokers | 0.005179 |
| 6 | median_age | 0.003941 |
| 4 | life_expectancy | 0.002516 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a relative path or a config
# variable would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Luxembourg'
# Keep only the country-health-index features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.20 | 94277.965 | 231.447 | 0.377872 |
2068 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate so lags never cross country boundaries)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country's series have no history to shift from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at
# this point includes 'Mortality Rate' (the prediction target) and its three
# lagged copies — the target leaks into the fitted components. Confirm whether
# only the predictor columns were intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset before
# the train/test split, so test rows influence the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these five columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — reusing the original feature
# names makes the later feature-importance table read as if it ranked the raw
# variables. Names like 'PC1'..'PC5' would be truthful.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model
# (X below is built from principal_df only) — confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set is
# transformed with training-set statistics only, avoiding leakage at this step)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor (its hyperparameters are tuned
# below; random_state fixes the bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; it does not
# change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation R^2 score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9884045961442933
# Reuse the best model found by the grid search above.
# NOTE: GridSearchCV refits the best parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is the same model
# that re-instantiating RandomForestRegressor with best_params_ (random_state=42
# is carried over from the base estimator) and calling fit again would produce —
# without the redundant second fit.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and zero entries in y_test make it ill-defined;
# confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0008681898404388339 R2 Score: 0.9977182185591683 RMSE: 0.029465 Entropy Value: 0.00034460747180680264
# Rank the model inputs by the importance scores of the fitted Random Forest.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.906843 |
| 2 | extreme_poverty | 0.047073 |
| 0 | hospital_beds_per_thousand | 0.033660 |
| 3 | gdp_per_capita | 0.009333 |
| 4 | population_density | 0.003090 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a relative path or a config
# variable would make the notebook portable across machines.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Romania'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[country_mask, population_health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2072 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (grouping by 'location' keeps each country's series separate so lags never cross country boundaries)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country's series have no history to shift from)
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at
# this point includes 'Mortality Rate' (the prediction target) and its three
# lagged copies — the target leaks into the fitted components. Confirm whether
# only the predictor columns were intended.
# NOTE(review): PCA is also fit on unscaled data and on the full dataset before
# the train/test split, so test rows influence the components.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these seven columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — reusing the original feature
# names makes the later feature-importance table read as if it ranked the raw
# variables. Names like 'PC1'..'PC7' would be truthful.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model
# (X below is built from principal_df only) — confirm this step is still needed.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a daily time series mixes future and past
# observations between train and test; a chronological split would avoid
# look-ahead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set is
# transformed with training-set statistics only, avoiding leakage at this step)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor (its hyperparameters are tuned
# below; random_state fixes the bootstrap sampling for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; it does not
# change the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation R^2 score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9954484396430019
# Reuse the best model found by the grid search above.
# NOTE: GridSearchCV refits the best parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is the same model
# that re-instantiating RandomForestRegressor with best_params_ (random_state=42
# is carried over from the base estimator) and calling fit again would produce —
# without the redundant second fit.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and zero entries in y_test make it ill-defined;
# confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00747804703817538 R2 Score: 0.9948675599333417 RMSE: 0.086476 Entropy Value: 0.0006442605230367055
# Rank the model inputs by the importance scores of the fitted Random Forest.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.835149 |
| 1 | diabetes_prevalence | 0.111452 |
| 2 | female_smokers | 0.020746 |
| 6 | median_age | 0.020522 |
| 5 | aged_65_older | 0.008818 |
| 3 | male_smokers | 0.002733 |
| 4 | life_expectancy | 0.000580 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — a relative path or a config
# variable would make the notebook portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Romania'
# Keep only the country-health-index features (plus identifiers and the target)
# and restrict the rows to the two countries being compared.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 2.036403 |
2072 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country.
# The groupby is built once instead of three times, and the NaNs that shift()
# produces at the start of each country's series are filled with 0 in the same pass.
# NOTE(review): groupby().shift() assumes rows are already ordered by date within each
# location -- confirm the upstream ordering, or sort_values(['location', 'date']) first.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the target ('Mortality Rate') and the lagged-mortality columns -- fitting
# PCA with the target inside leaks it into the "feature" components. PCA is also run
# on unscaled data, so large-magnitude columns (e.g. gdp_per_capita) dominate the
# components; scaling before PCA is the usual practice. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns), not the original variables -- labelling them with the raw feature names is
# misleading and makes the later "feature importance" read as if it ranked the
# original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used below --
# X is built from principal_df and y from 'Mortality Rate'; this looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on a daily time series places temporally
# adjacent (highly autocorrelated) rows in both train and test, which inflates the
# reported scores; a time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits (no test-set leakage
# into the scaling statistics).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (these hyperparameters are overridden
# by the grid below; random_state keeps the search reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel across cores; the results
# are unchanged because the forest's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968976083567679
# Refit a RandomForestRegressor with the best hyperparameters found by the grid
# search. Unpacking best_params_ stays in sync automatically if the grid ever changes,
# instead of re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence between the two
# vectors treated as (normalised) probability distributions -- it is not a standard
# regression error metric, and zeros in y_test make it ill-defined. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006146149886833695 R2 Score: 0.9957816866123151 RMSE: 0.078397 Entropy Value: 0.0005517608195709189
# Rank the model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from PCA components that were merely labelled with the raw
# feature names, so these importances rank components, not the original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.739105 |
| 0 | hospital_beds_per_thousand | 0.221891 |
| 2 | extreme_poverty | 0.032045 |
| 3 | gdp_per_capita | 0.005941 |
| 4 | population_density | 0.001018 |
# Load the dataframe holding the first country of each country pairing
# (produced in the previous step).
source_csv = "C:/Users/marco/Downloads/dataframe-one.csv"
df_firstCountryPairing = pd.read_csv(source_csv)
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Ireland'
# Restrict the data to the population-health-index features used in the Random Forest
# analysis, keeping only the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[df_firstCountryPairing['location'].isin([country1, country2]), keep_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.9 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.3 | 13.928 | 38.7 | 0.491388 |
2097 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country.
# The groupby is built once instead of three times, and the NaNs that shift()
# produces at the start of each country's series are filled with 0 in the same pass.
# NOTE(review): groupby().shift() assumes rows are already ordered by date within each
# location -- confirm the upstream ordering, or sort_values(['location', 'date']) first.
mortality_by_country = df_firstCountryPairing.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_firstCountryPairing[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the target ('Mortality Rate') and the lagged-mortality columns -- fitting
# PCA with the target inside leaks it into the "feature" components. PCA is also run
# on unscaled data, so large-magnitude columns (e.g. cardiovasc_death_rate) dominate
# the components; scaling before PCA is the usual practice. Confirm this is intended.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns), not the original variables -- labelling them with the raw feature names is
# misleading and makes the later "feature importance" read as if it ranked the
# original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used below --
# X is built from principal_df and y from 'Mortality Rate'; this looks like dead code.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on a daily time series places temporally
# adjacent (highly autocorrelated) rows in both train and test, which inflates the
# reported scores; a time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits (no test-set leakage
# into the scaling statistics).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (these hyperparameters are overridden
# by the grid below; random_state keeps the search reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel across cores; the results
# are unchanged because the forest's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9979884469227974
# Refit a RandomForestRegressor with the best hyperparameters found by the grid
# search. Unpacking best_params_ stays in sync automatically if the grid ever changes,
# instead of re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence between the two
# vectors treated as (normalised) probability distributions -- it is not a standard
# regression error metric, and zeros in y_test make it ill-defined. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005995659189568175 R2 Score: 0.9977638141467456 RMSE: 0.077432 Entropy Value: 0.0012403023391896217
# Rank the model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from PCA components that were merely labelled with the raw
# feature names, so these importances rank components, not the original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.826356 |
| 0 | cardiovasc_death_rate | 0.143178 |
| 2 | female_smokers | 0.015810 |
| 6 | median_age | 0.007248 |
| 3 | male_smokers | 0.004191 |
| 5 | aged_65_older | 0.002957 |
| 4 | life_expectancy | 0.000260 |
# Reload the first-country-pairing dataframe; this copy (df_updated) feeds the
# country-health-index analysis below.
source_csv = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(source_csv)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Ireland'
# Restrict the data to the country-health-index features used in the Random Forest
# analysis, keeping only the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.2 | 46682.515 | 136.520 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 0.491388 |
2097 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country.
# The groupby is built once instead of three times, and the NaNs that shift()
# produces at the start of each country's series are filled with 0 in the same pass.
# NOTE(review): groupby().shift() assumes rows are already ordered by date within each
# location -- confirm the upstream ordering, or sort_values(['location', 'date']) first.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes the target ('Mortality Rate') and the lagged-mortality columns -- fitting
# PCA with the target inside leaks it into the "feature" components. PCA is also run
# on unscaled data, so large-magnitude columns (e.g. gdp_per_capita) dominate the
# components; scaling before PCA is the usual practice. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input
# columns), not the original variables -- labelling them with the raw feature names is
# misleading and makes the later "feature importance" read as if it ranked the
# original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used below --
# X is built from principal_df and y from 'Mortality Rate'; this looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on a daily time series places temporally
# adjacent (highly autocorrelated) rows in both train and test, which inflates the
# reported scores; a time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits (no test-set leakage
# into the scaling statistics).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (these hyperparameters are overridden
# by the grid below; random_state keeps the search reproducible).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 runs the 81 candidates x 10 folds in parallel across cores; the results
# are unchanged because the forest's random_state is fixed.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981483428520672
# Refit a RandomForestRegressor with the best hyperparameters found by the grid
# search. Unpacking best_params_ stays in sync automatically if the grid ever changes,
# instead of re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence between the two
# vectors treated as (normalised) probability distributions -- it is not a standard
# regression error metric, and zeros in y_test make it ill-defined. Confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007388200063707959 R2 Score: 0.9972444416967158 RMSE: 0.085955 Entropy Value: 0.00131007217788142
# Rank the model inputs by the fitted forest's impurity-based importances.
# NOTE(review): X was built from PCA components that were merely labelled with the raw
# feature names, so these importances rank components, not the original variables.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956110 |
| 0 | hospital_beds_per_thousand | 0.023453 |
| 2 | extreme_poverty | 0.016011 |
| 3 | gdp_per_capita | 0.003906 |
| 4 | population_density | 0.000520 |
# Reload the first-country-pairing dataframe for the next country pair's
# population-health analysis.
source_csv = "C:/Users/marco/Downloads/dataframe-one.csv"
df_firstCountryPairing = pd.read_csv(source_csv)
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Latvia'
# Restrict the data to the population-health-index features used in the Random Forest
# analysis, keeping only the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[df_firstCountryPairing['location'].isin([country1, country2]), keep_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month) per country.
# The groupby is built once instead of three times, and the NaNs that shift()
# produces at the start of each country's series are filled with 0 in the same pass.
# NOTE(review): groupby().shift() assumes rows are already ordered by date within each
# location -- confirm the upstream ordering, or sort_values(['location', 'date']) first.
mortality_by_country = df_firstCountryPairing.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_firstCountryPairing[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes the 'Mortality Rate' target AND the three prev_*_mortality lag
# columns — the PCA is therefore fitted with the target mixed into its inputs,
# and on ALL rows before the train/test split (leakage on both counts).
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
# Project every row (train and test alike) onto the first 7 fitted components.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels reuse the raw feature names, but each column is a
# principal component — a linear combination of ALL PCA inputs (including
# 'Mortality Rate' and the lag columns) — not the named feature itself; any
# feature-importance read-out under these names is misleading. Prefer PC1..PC7
# labels, or skip PCA entirely if per-feature importance is the goal.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_firstCountryPairing after this point.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target vector; it was also part of the PCA inputs above, so X already encodes
# y — this largely explains the near-perfect scores reported later.
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split on a daily time series lets the model
# train on days adjacent to each test day — consider a chronological split for
# an honest forecast-style evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the estimator's score method (R^2 for a regressor);
# with refit=True (the default) best_estimator_ is refit on the full training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9979817144366736
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ is already this exact model refit
# on the training data; rebuilding it by hand is redundant (but harmless).
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1
# and returns their KL divergence — a distribution-divergence measure, defined
# only for non-negative inputs, not a standard regression error metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0018696016902774828 R2 Score: 0.9968998375395624 RMSE: 0.043239 Entropy Value: 0.0007273351883650379
# Rank the model inputs by impurity-based importance, highest first.
# NOTE: because PCA was applied upstream, each "feature" here is actually a
# principal component carrying the listed feature's name, not the raw feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.954112 |
| 2 | female_smokers | 0.017664 |
| 0 | cardiovasc_death_rate | 0.012118 |
| 5 | aged_65_older | 0.010583 |
| 6 | median_age | 0.004349 |
| 3 | male_smokers | 0.000941 |
| 4 | life_expectancy | 0.000232 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this country-health-index run.
country1 = 'Estonia'
country2 = 'Latvia'
# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows belonging to the two paired countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 0.631969 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality-rate series:
# previous-day, previous-week and previous-month values, computed per country
# (groupby 'location') so no value is carried across a country boundary.
# Rows with no history yet (each country's first 1/7/30 days) are filled with 0.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_name] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and the three
# prev_*_mortality lag columns, and the fit happens on all rows before the
# train/test split — target leakage plus train/test leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project every row (train and test alike) onto the first 5 fitted components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components mislabeled with raw feature
# names — each is a linear mix of ALL PCA inputs (including the target and lag
# columns), so downstream importances do not refer to the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
# Target vector; it was also part of the PCA inputs above (leakage).
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a daily time series — adjacent days end
# up on both sides of the split; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to R^2 for a regressor; best_estimator_ is refit
# automatically on the whole training set (refit=True).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975189433093143
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ already holds this refit model;
# rebuilding it here is redundant but harmless.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# normalised vectors — not a regression error metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002613839817756454 R2 Score: 0.9956657462802128 RMSE: 0.051126 Entropy Value: 0.0010067130210343615
# Rank the model inputs by impurity-based importance, highest first.
# NOTE: because PCA was applied upstream, each "feature" here is actually a
# principal component carrying the listed feature's name, not the raw feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.960216 |
| 2 | extreme_poverty | 0.023054 |
| 0 | hospital_beds_per_thousand | 0.014476 |
| 3 | gdp_per_capita | 0.001788 |
| 4 | population_density | 0.000466 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): same hard-coded absolute Windows path as earlier loads —
# consider a relative path or configurable constant.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this population-health-index run.
country1 = 'Finland'
country2 = 'Iceland'
# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two paired countries.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality-rate series:
# previous-day, previous-week and previous-month values, computed per country
# (groupby 'location') so no value is carried across a country boundary.
# Rows with no history yet (each country's first 1/7/30 days) are filled with 0.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    lagged = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_firstCountryPairing[lag_name] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and the three
# prev_*_mortality lag columns, and the fit happens on all rows before the
# train/test split — target leakage plus train/test leakage.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
# Project every row (train and test alike) onto the first 7 fitted components.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components mislabeled with raw feature
# names — each is a linear mix of ALL PCA inputs (including the target and lag
# columns), so downstream importances do not refer to the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_firstCountryPairing after this point.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target vector; it was also part of the PCA inputs above (leakage).
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a daily time series — adjacent days end
# up on both sides of the split; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to R^2 for a regressor; best_estimator_ is refit
# automatically on the whole training set (refit=True).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ already holds this refit model;
# rebuilding it here is redundant but harmless.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# normalised vectors — not a regression error metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003560841308279542 R2 Score: 0.9970021670932279 RMSE: 0.059673 Entropy Value: 0.0011802071599462333
# Rank the model inputs by impurity-based importance, highest first.
# NOTE: because PCA was applied upstream, each "feature" here is actually a
# principal component carrying the listed feature's name, not the raw feature.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.517316 |
| 0 | cardiovasc_death_rate | 0.432531 |
| 6 | median_age | 0.019175 |
| 5 | aged_65_older | 0.014394 |
| 2 | female_smokers | 0.011943 |
| 3 | male_smokers | 0.003277 |
| 4 | life_expectancy | 0.001363 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): same hard-coded absolute Windows path as earlier loads —
# consider a relative path or configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this country-health-index run.
country1 = 'Finland'
country2 = 'Iceland'
# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows belonging to the two paired countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from the mortality-rate series:
# previous-day, previous-week and previous-month values, computed per country
# (groupby 'location') so no value is carried across a country boundary.
# Rows with no history yet (each country's first 1/7/30 days) are filled with 0.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    lagged = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_name] = lagged.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the 'Mortality Rate' target and the three
# prev_*_mortality lag columns, and the fit happens on all rows before the
# train/test split — target leakage plus train/test leakage.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project every row (train and test alike) onto the first 5 fitted components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components mislabeled with raw feature
# names — each is a linear mix of ALL PCA inputs (including the target and lag
# columns), so downstream importances do not refer to the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
# Target vector; it was also part of the PCA inputs above (leakage).
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random shuffle split on a daily time series — adjacent days end
# up on both sides of the split; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3 * 3 * 3 * 3 = 81 candidate combinations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to R^2 for a regressor; best_estimator_ is refit
# automatically on the whole training set (refit=True).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9953918346594799
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ already holds this refit model;
# rebuilding it here is redundant but harmless.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# normalised vectors — not a regression error metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0030042606321511634
R2 Score: 0.9974707462074646
RMSE: 0.054811
Entropy Value: 0.0010085785334707671
# Tabulate the fitted model's feature importances, largest first.
# (Labels come from selected_cols, i.e. the PCA-component labels used upstream.)
importance_table = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.941978 |
| 0 | hospital_beds_per_thousand | 0.035166 |
| 2 | extreme_poverty | 0.014051 |
| 3 | gdp_per_capita | 0.006777 |
| 4 | population_density | 0.002029 |
# Load the dataframe holding the first country of each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_firstCountryPairing = pd.read_csv(csv_path)
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries in this pairing.
country1 = 'France'
country2 = 'Italy'
# Keep only the population-health features needed for the Random Forest analysis,
# restricted to the two countries of interest.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[pair_mask, health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2135 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates 0-mortality
# observations; dropping the warm-up rows would be more faithful to the series.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which INCLUDES
# 'Mortality Rate' and its three lag columns — the prediction target is folded into the
# principal components later used as predictors. This is target leakage and largely
# explains the near-perfect R^2 downstream. PCA is also fitted on unscaled data and on
# the full dataset before any train/test split.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the principal components are relabelled with the ORIGINAL feature names,
# but each PC mixes ALL input columns (including the target's lags) — these labels, and
# the feature-importance table later built from them, are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used as model inputs below.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fitted on all rows before this split — test information leaks into X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters will be tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training split.
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9950840810056313
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the model on the held-out test split: MSE, RMSE, R^2 and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — not a conventional regression error metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07475212300690831
R2 Score: 0.995997908929556
RMSE: 0.273408
Entropy Value: 0.0007613902551123477
# Tabulate the fitted model's feature importances, largest first.
# (Labels come from selected_cols, i.e. the PCA-component labels used upstream.)
importance_table = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.953673 |
| 2 | female_smokers | 0.024054 |
| 0 | cardiovasc_death_rate | 0.010520 |
| 5 | aged_65_older | 0.004307 |
| 3 | male_smokers | 0.004017 |
| 6 | median_age | 0.002808 |
| 4 | life_expectancy | 0.000621 |
# Load the dataframe holding the first country of each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries in this pairing.
country1 = 'France'
country2 = 'Italy'
# Keep only the country-health-index features needed for the Random Forest analysis,
# restricted to the two countries of interest.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.00 | 35220.084 | 205.859 | 0.735109 |
2135 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates 0-mortality
# observations; dropping the warm-up rows would be more faithful to the series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which INCLUDES
# 'Mortality Rate' and its three lag columns — the prediction target is folded into the
# principal components later used as predictors. This is target leakage and largely
# explains the near-perfect R^2 downstream. PCA is also fitted on unscaled data and on
# the full dataset before any train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the principal components are relabelled with the ORIGINAL feature names,
# but each PC mixes ALL input columns (including the target's lags) — these labels, and
# the feature-importance table later built from them, are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used as model inputs below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fitted on all rows before this split — test information leaks into X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters will be tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training split.
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best CV score: 0.9949728012103314
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the model on the held-out test split: MSE, RMSE, R^2 and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — not a conventional regression error metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0891388770292683
R2 Score: 0.9952276686007263
RMSE: 0.298561
Entropy Value: 0.0009498782679573481
# Tabulate the fitted model's feature importances, largest first.
# (Labels come from selected_cols, i.e. the PCA-component labels used upstream.)
importance_table = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.966568 |
| 2 | extreme_poverty | 0.022584 |
| 0 | hospital_beds_per_thousand | 0.006094 |
| 3 | gdp_per_capita | 0.004111 |
| 4 | population_density | 0.000643 |
# Load the dataframe holding the first country of each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_firstCountryPairing = pd.read_csv(csv_path)
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries in this pairing.
country1 = 'Netherlands'
country2 = 'Sweden'
# Keep only the population-health features needed for the Random Forest analysis,
# restricted to the two countries of interest.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_firstCountryPairing['location'].isin([country1, country2])
df_firstCountryPairing = df_firstCountryPairing.loc[pair_mask, health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates 0-mortality
# observations; dropping the warm-up rows would be more faithful to the series.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which INCLUDES
# 'Mortality Rate' and its three lag columns — the prediction target is folded into the
# principal components later used as predictors. This is target leakage and largely
# explains the near-perfect R^2 downstream. PCA is also fitted on unscaled data and on
# the full dataset before any train/test split.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the principal components are relabelled with the ORIGINAL feature names,
# but each PC mixes ALL input columns (including the target's lags) — these labels, and
# the feature-importance table later built from them, are misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are created but never used as model inputs below.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fitted on all rows before this split — test information leaks into X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters will be tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training split.
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984656768745841
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the model on the held-out test split: MSE, RMSE, R^2 and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) is the KL divergence between the two vectors
# after normalising each to sum to 1 — not a conventional regression error metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010402460269562753
R2 Score: 0.9990051222850926
RMSE: 0.101992
Entropy Value: 0.00040629309095891516
# Tabulate the fitted model's feature importances, largest first.
# (Labels come from selected_cols, i.e. the PCA-component labels used upstream.)
importance_table = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.973529 |
| 2 | female_smokers | 0.023056 |
| 3 | male_smokers | 0.001375 |
| 0 | cardiovasc_death_rate | 0.000910 |
| 5 | aged_65_older | 0.000438 |
| 4 | life_expectancy | 0.000347 |
| 6 | median_age | 0.000345 |
# Load the dataframe holding the first country of each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Netherlands'
country2 = 'Sweden'
# Extracting important features for Random Forest Model Analysis for the country health index
# Restrict to the two countries of interest and keep only the identifier
# columns, the socio-economic predictors, and the mortality target.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 0.816005 |
2100 rows × 8 columns
# Why lagged variables: shift() turns the time-series into a supervised
# learning table (one row = one observation, one column = one feature) so a
# Random Forest — an ensemble method for non-sequential data — can be applied
# directly to assess which variables best predict COVID-19 mortality per country.
# Create lagged mortality features (previous day / week / month) per country,
# filling the leading NaNs produced by each shift with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes the 'Mortality Rate' target and the
# lag columns just created, so PCA is fit on the target itself (target
# leakage); it is also fit before the train/test split. This likely explains
# the near-perfect R^2 downstream — confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the (already-fit) feature matrix onto the first 5 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores merely relabelled
# with the original feature names — each label does NOT correspond to that
# raw column, which makes the later feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df), so this encoding step appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaler to both splits (transform only — no refit,
# so no test-set statistics leak into the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 combinations).
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984950385348217
# Fit the final Random Forest using the best hyperparameters from the grid
# search. Unpacking best_params_ directly avoids copying each parameter by
# hand (the original duplicated every key).
# NOTE(review): with GridSearchCV's default refit=True,
# grid_search.best_estimator_ is already an identical model refit on the full
# training set, so this retraining step could be skipped entirely.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes relative entropy (KL divergence); it is not a
# standard regression metric and is undefined where y_pred is 0 but y_test
# is not — confirm this "Entropy Value" is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011447944113759524 R2 Score: 0.9989051335756014 RMSE: 0.106995 Entropy Value: 0.0004521099453512048
# Rank the model inputs by Random Forest feature importance (descending).
# NOTE(review): X was built from PCA component scores that were merely
# relabelled with the original column names, so these "feature" importances
# actually describe principal components, not the raw features — verify.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.974035 |
| 2 | extreme_poverty | 0.023848 |
| 3 | gdp_per_capita | 0.001661 |
| 4 | population_density | 0.000405 |
| 0 | hospital_beds_per_thousand | 0.000052 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — a relative path or config
# value would make the notebook portable.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the dataframe.
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Portugal'
country2 = 'Spain'
# Extracting important features for Random Forest Model Analysis for the population health index
# Restrict to the two countries of interest and keep only the identifier
# columns, the population-health predictors, and the mortality target.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[df_firstCountryPairing['location'].isin([country1, country2]), population_health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2097 rows × 10 columns
# Why lagged variables: shift() turns the time-series into a supervised
# learning table (one row = one observation, one column = one feature) so a
# Random Forest — an ensemble method for non-sequential data — can be applied
# directly to assess which variables best predict COVID-19 mortality per country.
# Create lagged mortality features (previous day / week / month) per country,
# filling the leading NaNs produced by each shift with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_firstCountryPairing.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_firstCountryPairing[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes the 'Mortality Rate' target and the
# lag columns just created, so PCA is fit on the target itself (target
# leakage); it is also fit before the train/test split. This likely explains
# the near-perfect R^2 downstream — confirm whether this is intended.
pca = PCA()
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for Random Forest Model Analysis
# Project the (already-fit) feature matrix onto the first 7 components.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores merely relabelled
# with the original feature names — each label does NOT correspond to that
# raw column, which makes the later feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df), so this encoding step appears redundant.
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaler to both splits (transform only — no refit,
# so no test-set statistics leak into the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 combinations).
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998521297102364
# Fit the final Random Forest using the best hyperparameters from the grid
# search. Unpacking best_params_ directly avoids copying each parameter by
# hand (the original duplicated every key).
# NOTE(review): with GridSearchCV's default refit=True,
# grid_search.best_estimator_ is already an identical model refit on the full
# training set, so this retraining step could be skipped entirely.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes relative entropy (KL divergence); it is not a
# standard regression metric and is undefined where y_pred is 0 but y_test
# is not — confirm this "Entropy Value" is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013814747661354383 R2 Score: 0.9974717098839665 RMSE: 0.117536 Entropy Value: 0.0004896303455702192
# Rank the model inputs by Random Forest feature importance (descending).
# NOTE(review): X was built from PCA component scores that were merely
# relabelled with the original column names, so these "feature" importances
# actually describe principal components, not the raw features — verify.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.569929 |
| 0 | cardiovasc_death_rate | 0.229154 |
| 5 | aged_65_older | 0.161755 |
| 2 | female_smokers | 0.033937 |
| 6 | median_age | 0.002516 |
| 3 | male_smokers | 0.002434 |
| 4 | life_expectancy | 0.000275 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — a relative path or config
# value would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the dataframe.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Portugal'
country2 = 'Spain'
# Extracting important features for Random Forest Model Analysis for the country health index
# Restrict to the two countries of interest and keep only the identifier
# columns, the socio-economic predictors, and the mortality target.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 0.855148 |
2097 rows × 8 columns
# Why lagged variables: shift() turns the time-series into a supervised
# learning table (one row = one observation, one column = one feature) so a
# Random Forest — an ensemble method for non-sequential data — can be applied
# directly to assess which variables best predict COVID-19 mortality per country.
# Create lagged mortality features (previous day / week / month) per country,
# filling the leading NaNs produced by each shift with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes the 'Mortality Rate' target and the
# lag columns just created, so PCA is fit on the target itself (target
# leakage); it is also fit before the train/test split. This likely explains
# the near-perfect R^2 downstream — confirm whether this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the (already-fit) feature matrix onto the first 5 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores merely relabelled
# with the original feature names — each label does NOT correspond to that
# raw column, which makes the later feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs below (X is
# built from principal_df), so this encoding step appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaler to both splits (transform only — no refit,
# so no test-set statistics leak into the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 combinations).
param_grid = {'n_estimators': [50, 100, 200],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9970653272478941
# Fit the final Random Forest using the best hyperparameters from the grid
# search. Unpacking best_params_ directly avoids copying each parameter by
# hand (the original duplicated every key).
# NOTE(review): with GridSearchCV's default refit=True,
# grid_search.best_estimator_ is already an identical model refit on the full
# training set, so this retraining step could be skipped entirely.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes relative entropy (KL divergence); it is not a
# standard regression metric and is undefined where y_pred is 0 but y_test
# is not — confirm this "Entropy Value" is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.028816362882560887 R2 Score: 0.9947262065698224 RMSE: 0.169754 Entropy Value: 0.0011094469817051415
# Rank the model inputs by Random Forest feature importance (descending).
# NOTE(review): X was built from PCA component scores that were merely
# relabelled with the original column names, so these "feature" importances
# actually describe principal components, not the raw features — verify.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.941568 |
| 2 | extreme_poverty | 0.040873 |
| 0 | hospital_beds_per_thousand | 0.012181 |
| 3 | gdp_per_capita | 0.004690 |
| 4 | population_density | 0.000688 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — a relative path or config
# value would make the notebook portable.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the dataframe.
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for Random Forest Model Analysis for the population health index
# Restrict to the two countries of interest and keep only the identifier
# columns, the population-health predictors, and the mortality target.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_firstCountryPairing = df_firstCountryPairing.loc[df_firstCountryPairing['location'].isin([country1, country2]), population_health_cols]
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross country boundaries.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first day/week/month of each country's series with 0 fabricates
# zero-mortality observations instead of marking them missing -- confirm this is intended.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its lagged
# copies in the PCA input, so the components carry the target into the model's features
# (target leakage -- likely the cause of the near-perfect R^2 reported below). PCA is also
# fit on unscaled columns, so high-variance features dominate the components. TODO confirm.
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # number of input variables for Random Forest Model Analysis
# Project the data onto the fitted components and keep the first 7.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names makes the later "feature importance" table describe PCs while appearing to
# rank the named health indicators -- verify the intended interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below -- dead transformation?
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a daily time series mixes future and past days
# between train and test -- a chronological split may be more appropriate. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default KFold on already-shuffled daily observations lets near-duplicate
# adjacent days fall into different folds, inflating the CV score -- TimeSeriesSplit would
# give a more honest estimate for this data. TODO confirm.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997363058447263
# Fit the final Random Forest on the scaled training data using the best hyperparameters
# found by the grid search above. Unpacking best_params_ directly avoids copying each
# hyperparameter by hand (the original listed all four keys individually, which silently
# drops any parameter later added to the grid).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability distributions
# and computes a KL divergence; applied to raw regression targets/predictions the value has
# no clear statistical meaning (and zeros in y_test are problematic) -- confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004452525636132693 R2 Score: 0.9978154993065472 RMSE: 0.066727 Entropy Value: 0.0005847514751188148
# Rank the model inputs by their Random Forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the ranked table as the notebook cell output.
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.864046 |
| 1 | diabetes_prevalence | 0.091568 |
| 0 | cardiovasc_death_rate | 0.032643 |
| 5 | aged_65_older | 0.008213 |
| 2 | female_smokers | 0.002508 |
| 3 | male_smokers | 0.000722 |
| 4 | life_expectancy | 0.000301 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- breaks on any other machine; consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# First country pairing for the country-health-index Random Forest analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows for the two countries in this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Bare expression: displays the filtered dataframe as the notebook cell output.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 0.536669 |
2091 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first day/week/month of each country's series with 0 fabricates
# zero-mortality observations instead of marking them missing -- confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its lagged
# copies in the PCA input, so the components carry the target into the model's features
# (target leakage -- likely the cause of the near-perfect R^2 reported below). PCA is also
# fit on unscaled columns, so high-variance features dominate the components. TODO confirm.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # number of input variables for Random Forest Model Analysis
# Project the data onto the fitted components and keep the first 5.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names makes the later "feature importance" table describe PCs while appearing to
# rank the named country indicators -- verify the intended interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below -- dead transformation?
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a daily time series mixes future and past days
# between train and test -- a chronological split may be more appropriate. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default KFold on already-shuffled daily observations lets near-duplicate
# adjacent days fall into different folds, inflating the CV score -- TimeSeriesSplit would
# give a more honest estimate for this data. TODO confirm.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9972767647444323
# Fit the final Random Forest on the scaled training data using the best hyperparameters
# found by the grid search above. Unpacking best_params_ directly avoids copying each
# hyperparameter by hand (the original listed all four keys individually, which silently
# drops any parameter later added to the grid).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability distributions
# and computes a KL divergence; applied to raw regression targets/predictions the value has
# no clear statistical meaning (and zeros in y_test are problematic) -- confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007179321254382132 R2 Score: 0.9964776772689534 RMSE: 0.084731 Entropy Value: 0.0007774232059793869
# Rank the model inputs by their Random Forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the ranked table as the notebook cell output.
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.656420 |
| 0 | hospital_beds_per_thousand | 0.310241 |
| 2 | extreme_poverty | 0.024558 |
| 3 | gdp_per_capita | 0.008294 |
| 4 | population_density | 0.000487 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- breaks on any other machine; consider a relative path.
df_firstCountryPairing = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded dataframe as the notebook cell output.
df_firstCountryPairing
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Second country pairing for the population-health-index Random Forest analysis.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for Random Forest Model Analysis for the population health index
df_firstCountryPairing = df_firstCountryPairing[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two countries in this pairing.
df_firstCountryPairing = df_firstCountryPairing[df_firstCountryPairing['location'].isin([country1, country2])]
# Bare expression: displays the filtered dataframe as the notebook cell output.
df_firstCountryPairing
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross country boundaries.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(1)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(7)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first day/week/month of each country's series with 0 fabricates
# zero-mortality observations instead of marking them missing -- confirm this is intended.
df_firstCountryPairing['prev_day_mortality'] = df_firstCountryPairing['prev_day_mortality'].fillna(0)
df_firstCountryPairing['prev_week_mortality'] = df_firstCountryPairing['prev_week_mortality'].fillna(0)
df_firstCountryPairing['prev_month_mortality'] = df_firstCountryPairing['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its lagged
# copies in the PCA input, so the components carry the target into the model's features
# (target leakage -- likely the cause of the near-perfect R^2 reported below). PCA is also
# fit on unscaled columns, so high-variance features dominate the components. TODO confirm.
pca.fit(df_firstCountryPairing.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # number of input variables for Random Forest Model Analysis
# Project the data onto the fitted components and keep the first 7.
principal_components = pca.transform(df_firstCountryPairing.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names makes the later "feature importance" table describe PCs while appearing to
# rank the named health indicators -- verify the intended interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_firstCountryPairing['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below -- dead transformation?
df_firstCountryPairing = pd.get_dummies(df_firstCountryPairing, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_firstCountryPairing['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a daily time series mixes future and past days
# between train and test -- a chronological split may be more appropriate. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default KFold on already-shuffled daily observations lets near-duplicate
# adjacent days fall into different folds, inflating the CV score -- TimeSeriesSplit would
# give a more honest estimate for this data. TODO confirm.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
# Fit the final Random Forest on the scaled training data using the best hyperparameters
# found by the grid search above. Unpacking best_params_ directly avoids copying each
# hyperparameter by hand (the original listed all four keys individually, which silently
# drops any parameter later added to the grid).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability distributions
# and computes a KL divergence; applied to raw regression targets/predictions the value has
# no clear statistical meaning (and zeros in y_test are problematic) -- confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.483882915680185 R2 Score: 0.980047429301368 RMSE: 0.695617 Entropy Value: 0.005987538685788008
# Rank the model inputs by their Random Forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Display the ranked table as the notebook cell output.
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.770054 |
| 1 | diabetes_prevalence | 0.089675 |
| 5 | aged_65_older | 0.045220 |
| 6 | median_age | 0.031215 |
| 2 | female_smokers | 0.026417 |
| 3 | male_smokers | 0.023605 |
| 4 | life_expectancy | 0.013812 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- breaks on any other machine; consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Second country pairing for the country-health-index Random Forest analysis.
country1 = 'United Kingdom'
country2 = 'United States'
# Extracting important features for Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# Keep only the rows for the two countries in this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Bare expression: displays the filtered dataframe as the notebook cell output.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first day/week/month of each country's series with 0 fabricates
# zero-mortality observations instead of marking them missing -- confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its lagged
# copies in the PCA input, so the components carry the target into the model's features
# (target leakage -- likely the cause of the near-perfect R^2 reported below). PCA is also
# fit on unscaled columns, so high-variance features dominate the components. TODO confirm.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # number of input variables for Random Forest Model Analysis
# Project the data onto the fitted components and keep the first 5.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names makes the later "feature importance" table describe PCs while appearing to
# rank the named country indicators -- verify the intended interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below -- dead transformation?
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split of a daily time series mixes future and past days
# between train and test -- a chronological split may be more appropriate. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.9549424661572783
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ keeps this in sync with param_grid automatically
# instead of copying each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and an entropy measure
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors normalized as probability distributions — it is not a
# standard regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.9691062567746922 R2 Score: 0.9600395871063083 RMSE: 0.984432 Entropy Value: 0.008653279765116838
# Tabulate the fitted forest's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.868170 |
| 2 | extreme_poverty | 0.072435 |
| 0 | hospital_beds_per_thousand | 0.026159 |
| 4 | population_density | 0.021088 |
| 3 | gdp_per_capita | 0.012148 |
# Country Pair by Pair Analysis relative to cardiovascular death rate
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — only runs on this machine;
# consider a relative path or a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on cardiovascular death rate (13 pairs of countries)
# One dataframe per country; each holds that country's full daily time series.
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]
df_Ireland = df[(df.location == "Ireland")]
df_Italy = df[(df.location == "Italy")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Portugal = df[(df.location == "Portugal")]
df_Spain = df[(df.location == "Spain")]
df_Sweden = df[(df.location == "Sweden")]
df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Latvia = df[(df.location == "Latvia")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
# Drop the first two rows of the United Kingdom series — presumably to align
# its start date with the other countries; TODO confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory, but re-imported below
# from the Downloads folder — the two only coincide when the script is run
# from the Downloads directory; verify the execution environment.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# First pairing: compare Austria against Belgium
country1 = 'Austria'
country2 = 'Belgium'
# Restrict to the population-health-index features plus the target and keys
population_health_cols = ['location', 'date', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[population_health_cols]
# Keep only the two countries in this pair
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day / 7 days / 30 days back) per country,
# turning the time series into a supervised-learning table for the Random Forest.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in lag_spec.items():
    # shift() within each country so lags never cross a country boundary; the
    # first `lag` rows of each country have no history, so fill them with 0
    df_updated[lag_col] = by_country.shift(lag).fillna(0)
# Fit PCA on every column after location/date to address multi-collinearity.
# NOTE(review): PCA is fit on the FULL dataset (before the train/test split),
# and the iloc[:, 2:] slice includes 'Mortality Rate' itself plus its lags —
# this leaks the target into the components and inflates downstream scores;
# confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable in the
# population-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Label the PCA outputs as principal components, not as raw features.
# NOTE(review): a previous revision named these columns after the original
# features ('diabetes_prevalence', ...), but each principal component is a
# linear combination of ALL inputs, so downstream importance tables were
# mislabeled as raw-feature importances.
selected_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no train->test leakage in scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9982254364422947
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ keeps this in sync with param_grid automatically
# instead of copying each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and an entropy measure
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors normalized as probability distributions — it is not a
# standard regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013887676064707104 R2 Score: 0.998827031715265 RMSE: 0.117846 Entropy Value: 0.0007324096000498459
# Tabulate the fitted forest's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.946862 |
| 5 | median_age | 0.040970 |
| 2 | male_smokers | 0.004957 |
| 3 | life_expectancy | 0.004735 |
| 1 | female_smokers | 0.002206 |
| 4 | aged_65_older | 0.000270 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pairing: Austria vs Belgium, country-health-index feature set
country1 = 'Austria'
country2 = 'Belgium'
# Restrict to the country-health-index features plus the target and keys
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
# Keep only the two countries in this pair
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day / 7 days / 30 days back) per country,
# turning the time series into a supervised-learning table for the Random Forest.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in lag_spec.items():
    # shift() within each country so lags never cross a country boundary; the
    # first `lag` rows of each country have no history, so fill them with 0
    df_updated[lag_col] = by_country.shift(lag).fillna(0)
# Fit PCA on every column after location/date to address multi-collinearity.
# NOTE(review): PCA is fit on the FULL dataset (before the train/test split),
# and the iloc[:, 2:] slice includes 'Mortality Rate' itself plus its lags —
# this leaks the target into the components and inflates downstream scores;
# confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable in the
# country-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Label the PCA outputs as principal components, not as raw features.
# NOTE(review): a previous revision named these columns after the original
# features ('hospital_beds_per_thousand', ...), but each principal component is
# a linear combination of ALL inputs, so downstream importance tables were
# mislabeled as raw-feature importances.
selected_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies().
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training set only (no train->test leakage in scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9978932925070829
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ keeps this in sync with param_grid automatically
# instead of copying each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and an entropy measure
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors normalized as probability distributions — it is not a
# standard regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008856177569188209 R2 Score: 0.9992519975722188 RMSE: 0.094107 Entropy Value: 0.000433455898033473
# Tabulate the fitted forest's feature importances, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927909 |
| 2 | extreme_poverty | 0.038889 |
| 5 | population | 0.030571 |
| 3 | gdp_per_capita | 0.002263 |
| 4 | population_density | 0.000344 |
| 0 | hospital_beds_per_thousand | 0.000024 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Next pairing: compare Canada against Cyprus
country1 = 'Canada'
country2 = 'Cyprus'
# Restrict to the population-health-index features plus the target and keys
population_health_cols = ['location', 'date', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[population_health_cols]
# Keep only the two countries in this pair
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
'''
Convert the per-country mortality time series into a supervised-learning table
by adding lagged copies of the target: the mortality rate 1 day, 7 days, and
30 days earlier. A Random Forest operates on tabular, non-sequential rows, so
the temporal signal must be encoded explicitly as lag columns before the model
can be used to rank predictors of COVID-19 mortality per country.
'''
# Lag each country's own series independently so values never cross locations.
lag_columns = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_columns.items():
    # The first `lag` rows of each country have no history; fill them with 0.
    df_updated[column_name] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and the three lagged-mortality columns added above,
# so the PCA is fitted on the prediction target itself (target leakage). It is
# also fitted on the full dataset before the train/test split and on unscaled
# values — confirm whether all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): the transformed matrix iloc[:, 2:] still contains
# 'Mortality Rate' and its lag columns, so the six retained components encode
# the target variable itself — this likely explains the near-perfect R^2 below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the raw
# variable — downstream "feature importances" inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order of principal_components matches df_updated positionally, so the
# location labels can be copied across via .values.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — only
# 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-series rows, so training rows can
# postdate test rows — confirm a chronological split isn't required here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; the fixed seed keeps the search reproducible.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_forest, param_grid=search_space, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983737881892208
# Refit a RandomForestRegressor with the best configuration found by the grid
# search; best_params_ holds exactly the four tuned hyperparameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence — not a conventional
# regression metric. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0024167910786875983 R2 Score: 0.9992894760777331 RMSE: 0.049161 Entropy Value: 0.0002770358074043776
# Impurity-based importances from the fitted forest (non-negative, sum to 1).
feature_importances = best_rf_model.feature_importances_
# NOTE(review): selected_cols are labels carried over from principal_df, whose
# columns are principal components, not the original variables — so each
# "importance" belongs to a component, and attributing it to the named raw
# feature is not valid without mapping back through the PCA loadings.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.760003 |
| 1 | female_smokers | 0.112288 |
| 5 | median_age | 0.086465 |
| 2 | male_smokers | 0.037642 |
| 3 | life_expectancy | 0.003177 |
| 4 | aged_65_older | 0.000424 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Cyprus'
# Keep only the socio-economic ("country health index") features plus the
# target, restricted to the selected pair of countries.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
'''
Convert the per-country mortality time series into a supervised-learning table
by adding lagged copies of the target: the mortality rate 1 day, 7 days, and
30 days earlier. A Random Forest operates on tabular, non-sequential rows, so
the temporal signal must be encoded explicitly as lag columns before the model
can be used to rank predictors of COVID-19 mortality per country.
'''
# Lag each country's own series independently so values never cross locations.
lag_columns = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_columns.items():
    # The first `lag` rows of each country have no history; fill them with 0.
    df_updated[column_name] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and the three lagged-mortality columns added above,
# so the PCA is fitted on the prediction target itself (target leakage). It is
# also fitted on the full dataset before the train/test split and on unscaled
# values — confirm whether all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the transformed matrix iloc[:, 2:] still contains
# 'Mortality Rate' and its lag columns, so the six retained components encode
# the target variable itself — this likely explains the near-perfect R^2 below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the raw
# variable — downstream "feature importances" inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order of principal_components matches df_updated positionally, so the
# location labels can be copied across via .values.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — only
# 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-series rows, so training rows can
# postdate test rows — confirm a chronological split isn't required here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; the fixed seed keeps the search reproducible.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_forest, param_grid=search_space, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
# Refit a RandomForestRegressor with the best configuration found by the grid
# search; best_params_ holds exactly the four tuned hyperparameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence — not a conventional
# regression metric. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032162020605789846 R2 Score: 0.9990544534349546 RMSE: 0.056712 Entropy Value: 0.000341364428693161
# Impurity-based importances from the fitted forest (non-negative, sum to 1).
feature_importances = best_rf_model.feature_importances_
# NOTE(review): selected_cols are labels carried over from principal_df, whose
# columns are principal components, not the original variables — so each
# "importance" belongs to a component, and attributing it to the named raw
# feature is not valid without mapping back through the PCA loadings.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.515688 |
| 1 | human_development_index | 0.452145 |
| 2 | extreme_poverty | 0.028238 |
| 3 | gdp_per_capita | 0.003655 |
| 4 | population_density | 0.000192 |
| 0 | hospital_beds_per_thousand | 0.000083 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Keep only the demographic/health ("population health index") features plus
# the target, restricted to the selected pair of countries.
country_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
'''
Convert the per-country mortality time series into a supervised-learning table
by adding lagged copies of the target: the mortality rate 1 day, 7 days, and
30 days earlier. A Random Forest operates on tabular, non-sequential rows, so
the temporal signal must be encoded explicitly as lag columns before the model
can be used to rank predictors of COVID-19 mortality per country.
'''
# Lag each country's own series independently so values never cross locations.
lag_columns = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_columns.items():
    # The first `lag` rows of each country have no history; fill them with 0.
    df_updated[column_name] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and the three lagged-mortality columns added above,
# so the PCA is fitted on the prediction target itself (target leakage). It is
# also fitted on the full dataset before the train/test split and on unscaled
# values — confirm whether all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): the transformed matrix iloc[:, 2:] still contains
# 'Mortality Rate' and its lag columns, so the six retained components encode
# the target variable itself — this likely explains the near-perfect R^2 below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the raw
# variable — downstream "feature importances" inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order of principal_components matches df_updated positionally, so the
# location labels can be copied across via .values.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — only
# 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-series rows, so training rows can
# postdate test rows — confirm a chronological split isn't required here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; the fixed seed keeps the search reproducible.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_forest, param_grid=search_space, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985717047514587
# Refit a RandomForestRegressor with the best configuration found by the grid
# search; best_params_ holds exactly the four tuned hyperparameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence — not a conventional
# regression metric. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007552765911951941 R2 Score: 0.9957154647674932 RMSE: 0.086907 Entropy Value: 0.0014811041103746996
# Impurity-based importances from the fitted forest (non-negative, sum to 1).
feature_importances = best_rf_model.feature_importances_
# NOTE(review): selected_cols are labels carried over from principal_df, whose
# columns are principal components, not the original variables — so each
# "importance" belongs to a component, and attributing it to the named raw
# feature is not valid without mapping back through the PCA loadings.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.949543 |
| 2 | male_smokers | 0.035863 |
| 1 | female_smokers | 0.008964 |
| 3 | life_expectancy | 0.004204 |
| 5 | median_age | 0.000997 |
| 4 | aged_65_older | 0.000429 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Keep only the socio-economic ("country health index") features plus the
# target, restricted to the selected pair of countries.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
'''
Convert the per-country mortality time series into a supervised-learning table
by adding lagged copies of the target: the mortality rate 1 day, 7 days, and
30 days earlier. A Random Forest operates on tabular, non-sequential rows, so
the temporal signal must be encoded explicitly as lag columns before the model
can be used to rank predictors of COVID-19 mortality per country.
'''
# Lag each country's own series independently so values never cross locations.
lag_columns = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column_name, lag in lag_columns.items():
    # The first `lag` rows of each country have no history; fill them with 0.
    df_updated[column_name] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and the three lagged-mortality columns added above,
# so the PCA is fitted on the prediction target itself (target leakage). It is
# also fitted on the full dataset before the train/test split and on unscaled
# values — confirm whether all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the transformed matrix iloc[:, 2:] still contains
# 'Mortality Rate' and its lag columns, so the six retained components encode
# the target variable itself — this likely explains the near-perfect R^2 below.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the raw
# variable — downstream "feature importances" inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order of principal_components matches df_updated positionally, so the
# location labels can be copied across via .values.
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below — only
# 'Mortality Rate' is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles time-series rows, so training rows can
# postdate test rows — confirm a chronological split isn't required here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; the fixed seed keeps the search reproducible.
base_forest = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_forest, param_grid=search_space, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
# Reuse the model GridSearchCV already refit on the full training set with
# the best hyperparameters (refit=True is the default). The original code
# trained an identical forest a second time by hand — same params, same
# random_state=42, same data — so this is the same model without the
# duplicated training cost.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as
# (unnormalised) probability distributions and returns a KL divergence —
# applied to raw mortality values this is not a regression error metric;
# interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008329520038950185 R2 Score: 0.995274827461516 RMSE: 0.091266 Entropy Value: 0.0016508456649447665
# Rank the model inputs by the forest's impurity-based importances.
# NOTE(review): the 'feature' labels are principal components relabelled
# with original column names, so read these importances with caution.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956574 |
| 2 | extreme_poverty | 0.028531 |
| 5 | population | 0.008398 |
| 3 | gdp_per_capita | 0.006081 |
| 4 | population_density | 0.000371 |
| 0 | hospital_beds_per_thousand | 0.000044 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison.
country1 = 'France'
country2 = 'Iceland'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries of interest.
pop_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days back), per country.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    # shift() leaves NaN for each country's first `periods` rows; replace
    # with 0 (NOTE(review): this fabricates zero mortality for the earliest
    # dates instead of dropping those rows — confirm this is intended).
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# PCA over every column from position 2 onward.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags, so
# the target leaks into the model inputs; PCA is also fit on the full data
# before the train/test split. Both likely inflate the reported scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the first 6 principal components — one per input variable of the
# population-health Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): the principal components are mixtures of all inputs; naming
# them after the original features is misleading but kept for downstream labels.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using training-split statistics only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest; the hyperparameters below are tuned by grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values (3*3*3*3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9911158420617596
# Reuse the model GridSearchCV already refit on the full training set with
# the best hyperparameters (refit=True is the default). The original code
# trained an identical forest a second time by hand — same params, same
# random_state=42, same data — so this is the same model without the
# duplicated training cost.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as
# (unnormalised) probability distributions and returns a KL divergence —
# applied to raw mortality values this is not a regression error metric;
# interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.12607663768274116 R2 Score: 0.9899744938655387 RMSE: 0.355073 Entropy Value: 0.0030851096784643584
# Rank the model inputs by the forest's impurity-based importances.
# NOTE(review): the 'feature' labels are principal components relabelled
# with original column names, so read these importances with caution.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.534321 |
| 1 | female_smokers | 0.332868 |
| 0 | diabetes_prevalence | 0.117862 |
| 2 | male_smokers | 0.010450 |
| 3 | life_expectancy | 0.003187 |
| 4 | aged_65_older | 0.001313 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison.
country1 = 'France'
country2 = 'Iceland'
# Keep only the country-health features (plus identifiers and the target)
# and restrict the rows to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days back), per country.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    # shift() leaves NaN for each country's first `periods` rows; replace
    # with 0 (NOTE(review): this fabricates zero mortality for the earliest
    # dates instead of dropping those rows — confirm this is intended).
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# PCA over every column from position 2 onward.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags, so
# the target leaks into the model inputs; PCA is also fit on the full data
# before the train/test split. Both likely inflate the reported scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the first 6 principal components — one per input variable of the
# country-health Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): the principal components are mixtures of all inputs; naming
# them after the original features is misleading but kept for downstream labels.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using training-split statistics only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest; the hyperparameters below are tuned by grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values (3*3*3*3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
# Reuse the model GridSearchCV already refit on the full training set with
# the best hyperparameters (refit=True is the default). The original code
# trained an identical forest a second time by hand — same params, same
# random_state=42, same data — so this is the same model without the
# duplicated training cost.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as
# (unnormalised) probability distributions and returns a KL divergence —
# applied to raw mortality values this is not a regression error metric;
# interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.1263773231369439 R2 Score: 0.9899505836160186 RMSE: 0.355496 Entropy Value: 0.0032779127594959943
# Rank the model inputs by the forest's impurity-based importances.
# NOTE(review): the 'feature' labels are principal components relabelled
# with original column names, so read these importances with caution.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.958053 |
| 2 | extreme_poverty | 0.017445 |
| 5 | population | 0.014344 |
| 3 | gdp_per_capita | 0.005157 |
| 0 | hospital_beds_per_thousand | 0.004252 |
| 4 | population_density | 0.000749 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison.
country1 = 'Ireland'
country2 = 'Italy'
# Keep only the population-health features (plus identifiers and the target)
# and restrict the rows to the two countries of interest.
pop_health_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days back), per country.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_spec.items():
    # shift() leaves NaN for each country's first `periods` rows; replace
    # with 0 (NOTE(review): this fabricates zero mortality for the earliest
    # dates instead of dropping those rows — confirm this is intended).
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# PCA over every column from position 2 onward.
# NOTE(review): that slice includes 'Mortality Rate' itself and its lags, so
# the target leaks into the model inputs; PCA is also fit on the full data
# before the train/test split. Both likely inflate the reported scores.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
# Keep the first 6 principal components — one per input variable of the
# population-health Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): the principal components are mixtures of all inputs; naming
# them after the original features is misleading but kept for downstream labels.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using training-split statistics only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest; the hyperparameters below are tuned by grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values (3*3*3*3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983164837805587
# Reuse the model GridSearchCV already refit on the full training set with
# the best hyperparameters (refit=True is the default). The original code
# trained an identical forest a second time by hand — same params, same
# random_state=42, same data — so this is the same model without the
# duplicated training cost.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as
# (unnormalised) probability distributions and returns a KL divergence —
# applied to raw mortality values this is not a regression error metric;
# interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00685068236066425 R2 Score: 0.9994372009296628 RMSE: 0.082769 Entropy Value: 0.0003107010743677336
# Rank the model inputs by the forest's impurity-based importances.
# NOTE(review): the 'feature' labels are principal components relabelled
# with original column names, so read these importances with caution.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.517556 |
| 0 | diabetes_prevalence | 0.329949 |
| 1 | female_smokers | 0.142975 |
| 2 | male_smokers | 0.006820 |
| 3 | life_expectancy | 0.002311 |
| 4 | aged_65_older | 0.000389 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Keep only the socioeconomic / health-system columns used as inputs for the
# country-health-index Random Forest analysis, restricted to the two countries of interest
wanted_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, wanted_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location so one country's leading rows are not filled with the other country's values
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 treats "no history yet" as a genuine 0 mortality rate;
# dropping those warm-up rows instead would avoid fabricating values
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its three lag columns, so the prediction target
# leaks into the components that later serve as model inputs — a likely cause of the near-perfect R^2.
# Fit PCA on predictor columns only, and standardize first: PCA is scale-sensitive and these columns
# span wildly different magnitudes (population vs. HDI).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the data onto the first 6 principal components (PCA above retained all components)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input columns), not the
# original features — reusing the feature names here makes the later feature-importance table misleading
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full dataset before this split, and a random (non-chronological) split
# is applied to a time series — both leak test-period information into training; consider fitting PCA on
# the training fold only and splitting by date
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both folds with the statistics learned on the training fold only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
# Refit a random forest configured with the hyperparameters the grid search selected
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into distributions and returns their
# KL divergence — it is not a standard regression error metric; interpret this value with caution
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013803653998387413 R2 Score: 0.9988659985635657 RMSE: 0.117489 Entropy Value: 0.0004399732479586434
# Tabulate the Random Forest importance score of each model input, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937618 |
| 2 | extreme_poverty | 0.027005 |
| 5 | population | 0.023078 |
| 0 | hospital_beds_per_thousand | 0.008608 |
| 3 | gdp_per_capita | 0.002772 |
| 4 | population_density | 0.000918 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — breaks anywhere but this workstation; prefer a relative
# path or a configurable data directory
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the demographic / lifestyle columns used as inputs for the
# population-health-index Random Forest analysis, restricted to the two countries of interest
wanted_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, wanted_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location so one country's leading rows are not filled with the other country's values
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 treats "no history yet" as a genuine 0 mortality rate;
# dropping those warm-up rows instead would avoid fabricating values
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its three lag columns, so the prediction target
# leaks into the components that later serve as model inputs — a likely cause of the near-perfect R^2.
# Fit PCA on predictor columns only, and standardize first: PCA is scale-sensitive.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the data onto the first 6 principal components (PCA above retained all components)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input columns), not the
# original features — reusing the feature names here makes the later feature-importance table misleading
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full dataset before this split, and a random (non-chronological) split
# is applied to a time series — both leak test-period information into training; consider fitting PCA on
# the training fold only and splitting by date
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both folds with the statistics learned on the training fold only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984742718532862
# Refit a random forest configured with the hyperparameters the grid search selected
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into distributions and returns their
# KL divergence — it is not a standard regression error metric; interpret this value with caution
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008485613991385917 R2 Score: 0.9988940462273057 RMSE: 0.092117 Entropy Value: 0.0013446805914148691
# Tabulate the Random Forest importance score of each model input, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.729067 |
| 0 | diabetes_prevalence | 0.212295 |
| 2 | male_smokers | 0.034604 |
| 1 | female_smokers | 0.021167 |
| 3 | life_expectancy | 0.002298 |
| 4 | aged_65_older | 0.000569 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — breaks anywhere but this workstation; prefer a relative
# path or a configurable data directory
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the socioeconomic / health-system columns used as inputs for the
# country-health-index Random Forest analysis, restricted to the two countries of interest
wanted_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, wanted_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per location so one country's leading rows are not filled with the other country's values
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 treats "no history yet" as a genuine 0 mortality rate;
# dropping those warm-up rows instead would avoid fabricating values
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND its three lag columns, so the prediction target
# leaks into the components that later serve as model inputs — a likely cause of the near-perfect R^2.
# Fit PCA on predictor columns only, and standardize first: PCA is scale-sensitive and these columns
# span wildly different magnitudes (population vs. HDI).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the data onto the first 6 principal components (PCA above retained all components)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL input columns), not the
# original features — reusing the feature names here makes the later feature-importance table misleading
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): PCA was fit on the full dataset before this split, and a random (non-chronological) split
# is applied to a time series — both leak test-period information into training; consider fitting PCA on
# the training fold only and splitting by date
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both folds with the statistics learned on the training fold only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
# Refit a random forest configured with the hyperparameters the grid search selected
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two arguments into distributions and returns their
# KL divergence — it is not a standard regression error metric; interpret this value with caution
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007512088981123421 R2 Score: 0.9990209284610492 RMSE: 0.086672 Entropy Value: 0.0009194530556352238
# Tabulate the Random Forest importance score of each model input, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.943109 |
| 2 | extreme_poverty | 0.037435 |
| 5 | population | 0.012103 |
| 3 | gdp_per_capita | 0.006234 |
| 0 | hospital_beds_per_thousand | 0.000694 |
| 4 | population_density | 0.000425 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute machine-specific path — breaks anywhere but this workstation; prefer a relative
# path or a configurable data directory
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Spain'
# Keep only the demographic / lifestyle columns used as inputs for the
# population-health-index Random Forest analysis, restricted to the two countries of interest
wanted_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, wanted_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2097 rows × 9 columns
'''
Convert the OWID COVID-19 time series into a supervised-learning table:
lagged mortality-rate columns (previous day / week / month) are built with
groupby().shift(), because a Random Forest is a non-sequential learner and
needs each row to be a self-contained observation. PCA is then applied to
reduce multi-collinearity among the inputs before fitting the model.
'''
# Build the lagged mortality features per country so that one country's
# history never bleeds into another's.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering before trusting these lags.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                            .shift(lag_days)
                            .fillna(0))
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the components are partly built from the target itself (target leakage),
# and PCA is fit on the full dataset before the train/test split. Both
# inflate the scores printed below -- TODO: fit PCA on training features
# only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# Keep 6 components -- one per input variable of the population-health-index
# Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the importance table below
# read as if it ranked the original features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the model itself trains only on the
# principal components).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs; the scaler is fit on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest hyperparameter search: 10-fold cross-validated grid over
# tree count, depth, and split/leaf sizes.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Refit on the full training split with the best hyperparameters found
# (unpacking best_params_ avoids re-spelling each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into
# probability distributions and returns their KL divergence; applied to raw
# mortality values this is not a standard regression metric -- interpret
# with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the six inputs by the forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.865528 |
| 1 | female_smokers | 0.064998 |
| 5 | median_age | 0.038122 |
| 2 | male_smokers | 0.021243 |
| 3 | life_expectancy | 0.009762 |
| 4 | aged_65_older | 0.000348 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- runs only on this machine; consider a
# relative path or a config setting.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the country-health-index
# Random Forest analysis.
country1 = 'Portugal'
country2 = 'Spain'
# Restrict the data to the identifier columns, the country-health-index
# inputs, and the target, keeping only rows for the two selected countries.
feature_subset = [
    'location', 'date', 'hospital_beds_per_thousand',
    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
# Display the filtered dataframe for inspection.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2097 rows × 9 columns
'''
Convert the OWID COVID-19 time series into a supervised-learning table:
lagged mortality-rate columns (previous day / week / month) are built with
groupby().shift(), because a Random Forest is a non-sequential learner and
needs each row to be a self-contained observation. PCA is then applied to
reduce multi-collinearity among the inputs before fitting the model.
'''
# Build the lagged mortality features per country so that one country's
# history never bleeds into another's.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering before trusting these lags.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                            .shift(lag_days)
                            .fillna(0))
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the components are partly built from the target itself (target leakage),
# and PCA is fit on the full dataset before the train/test split. Both
# inflate the scores printed below -- TODO: fit PCA on training features
# only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# Keep 6 components -- one per input variable of the country-health-index
# Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the importance table below
# read as if it ranked the original features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the model itself trains only on the
# principal components).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs; the scaler is fit on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest hyperparameter search: 10-fold cross-validated grid over
# tree count, depth, and split/leaf sizes.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Refit on the full training split with the best hyperparameters found
# (unpacking best_params_ avoids re-spelling each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into
# probability distributions and returns their KL divergence; applied to raw
# mortality values this is not a standard regression metric -- interpret
# with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the six inputs by the forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.929110 |
| 2 | extreme_poverty | 0.039519 |
| 5 | population | 0.027822 |
| 3 | gdp_per_capita | 0.001728 |
| 0 | hospital_beds_per_thousand | 0.001403 |
| 4 | population_density | 0.000418 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- runs only on this machine; consider a
# relative path or a config setting.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the population-health-index
# Random Forest analysis.
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the data to the identifier columns, the population-health-index
# inputs, and the target, keeping only rows for the two selected countries.
feature_subset = [
    'location', 'date', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
# Display the filtered dataframe for inspection.
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2102 rows × 9 columns
'''
Convert the OWID COVID-19 time series into a supervised-learning table:
lagged mortality-rate columns (previous day / week / month) are built with
groupby().shift(), because a Random Forest is a non-sequential learner and
needs each row to be a self-contained observation. PCA is then applied to
reduce multi-collinearity among the inputs before fitting the model.
'''
# Build the lagged mortality features per country so that one country's
# history never bleeds into another's.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location -- confirm the upstream ordering before trusting these lags.
for lag_name, lag_days in [('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)]:
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_name] = (df_updated.groupby('location')['Mortality Rate']
                            .shift(lag_days)
                            .fillna(0))
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the components are partly built from the target itself (target leakage),
# and PCA is fit on the full dataset before the train/test split. Both
# inflate the scores printed below -- TODO: fit PCA on training features
# only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# Keep 6 components -- one per input variable of the population-health-index
# Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the importance table below
# read as if it ranked the original features.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the model itself trains only on the
# principal components).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the inputs; the scaler is fit on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest hyperparameter search: 10-fold cross-validated grid over
# tree count, depth, and split/leaf sizes.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Refit on the full training split with the best hyperparameters found
# (unpacking best_params_ avoids re-spelling each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into
# probability distributions and returns their KL divergence; applied to raw
# mortality values this is not a standard regression metric -- interpret
# with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the six inputs by the forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.812203 |
| 0 | diabetes_prevalence | 0.156773 |
| 2 | male_smokers | 0.018060 |
| 5 | median_age | 0.007554 |
| 3 | life_expectancy | 0.004384 |
| 4 | aged_65_older | 0.001026 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- runs only on this machine; consider a
# relative path or a config setting.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the country-health-index
# Random Forest analysis.
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the data to the identifier columns, the country-health-index
# inputs, and the target, keeping only rows for the two selected countries.
feature_subset = [
    'location', 'date', 'hospital_beds_per_thousand',
    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population', 'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_subset]
# Display the filtered dataframe for inspection.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the training and test feature matrices with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
# NOTE(review): KFold ignores the time ordering behind the lagged features; a
# TimeSeriesSplit would avoid validating on rows earlier than the training folds — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9964944545383589
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability distributions
# (it normalises them and computes KL divergence); it is not a regression-residual
# metric — confirm this is the intended use for y_test vs y_pred.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025985308865571106 R2 Score: 0.9950355496702608 RMSE: 0.161200 Entropy Value: 0.0009864073077925806
# Rank the model inputs by their importance in the fitted Random Forest (highest first)
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.938681 |
| 5 | population | 0.038091 |
| 2 | extreme_poverty | 0.018932 |
| 3 | gdp_per_capita | 0.003068 |
| 4 | population_density | 0.000901 |
| 0 | hospital_beds_per_thousand | 0.000327 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — this cell only runs on the author's machine;
# consider a path relative to the notebook instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the population-health-index features plus identifiers and the target,
# restricted to the rows of the two selected countries
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (previous day / week / month), computed within each
# country group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
# NOTE(review): zero-filling the warm-up rows injects artificial "0 mortality" history;
# dropping those rows would be cleaner — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target column and its
# three lags, so the target leaks into the components later used as model inputs — this
# likely inflates the reported R^2. Also, PCA here is fit on unscaled features (scaling is
# applied only after PCA) and on all rows before the train/test split. Confirm all three.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable used in this analysis
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Build a dataframe holding the transformed (component) values
# NOTE(review): each principal component is a linear mixture of ALL columns fed to PCA
# (which here include 'Mortality Rate' and its lag columns); relabelling the components
# with the original feature names is misleading — the downstream "feature importances"
# rank components, not these named variables. Confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies()
# NOTE(review): the location dummies are never used as model inputs — X below is built
# solely from principal_df — so this step only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model
# NOTE(review): this randomly shuffles time-ordered rows; with lagged features a
# chronological split would avoid look-ahead — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training set only (applied to both sets below)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the training and test feature matrices with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
# NOTE(review): KFold ignores the time ordering behind the lagged features; a
# TimeSeriesSplit would avoid validating on rows earlier than the training folds — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638881170030213
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability distributions
# (it normalises them and computes KL divergence); it is not a regression-residual
# metric — confirm this is the intended use for y_test vs y_pred.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.307896296891642 R2 Score: 0.9460698187834822 RMSE: 1.143633 Entropy Value: 0.007993738527703239
# Rank the model inputs by their importance in the fitted Random Forest (highest first)
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.848283 |
| 1 | female_smokers | 0.041961 |
| 5 | median_age | 0.040427 |
| 3 | life_expectancy | 0.032823 |
| 2 | male_smokers | 0.019508 |
| 4 | aged_65_older | 0.016998 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — this cell only runs on the author's machine;
# consider a path relative to the notebook instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the country-health-index features plus identifiers and the target,
# restricted to the rows of the two selected countries
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (previous day / week / month), computed within each
# country group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
# NOTE(review): zero-filling the warm-up rows injects artificial "0 mortality" history;
# dropping those rows would be cleaner — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target column and its
# three lags, so the target leaks into the components later used as model inputs — this
# likely inflates the reported R^2. Also, PCA here is fit on unscaled features (scaling is
# applied only after PCA) and on all rows before the train/test split. Confirm all three.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable used in this analysis
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Build a dataframe holding the transformed (component) values
# NOTE(review): each principal component is a linear mixture of ALL columns fed to PCA
# (which here include 'Mortality Rate' and its lag columns); relabelling the components
# with the original feature names is misleading — the downstream "feature importances"
# rank components, not these named variables. Confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies()
# NOTE(review): the location dummies are never used as model inputs — X below is built
# solely from principal_df — so this step only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model
# NOTE(review): this randomly shuffles time-ordered rows; with lagged features a
# chronological split would avoid look-ahead — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training set only (applied to both sets below)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the training and test feature matrices with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
# NOTE(review): KFold ignores the time ordering behind the lagged features; a
# TimeSeriesSplit would avoid validating on rows earlier than the training folds — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638248976625985
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability distributions
# (it normalises them and computes KL divergence); it is not a regression-residual
# metric — confirm this is the intended use for y_test vs y_pred.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.908396956844949 R2 Score: 0.9625428922647679 RMSE: 0.953099 Entropy Value: 0.007020342279931816
# Rank the model inputs by their importance in the fitted Random Forest (highest first)
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.840735 |
| 2 | extreme_poverty | 0.059561 |
| 5 | population | 0.058444 |
| 3 | gdp_per_capita | 0.032403 |
| 4 | population_density | 0.008650 |
| 0 | hospital_beds_per_thousand | 0.000206 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — this cell only runs on the author's machine;
# consider a path relative to the notebook instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'Czechia'
country2 = 'Estonia'
# Keep the population-health-index features plus identifiers and the target,
# restricted to the rows of the two selected countries
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (previous day / week / month), computed within each
# country group so lags never cross a country boundary
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaN values produced at the start of each country's series with 0
# NOTE(review): zero-filling the warm-up rows injects artificial "0 mortality" history;
# dropping those rows would be cleaner — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target column and its
# three lags, so the target leaks into the components later used as model inputs — this
# likely inflates the reported R^2. Also, PCA here is fit on unscaled features (scaling is
# applied only after PCA) and on all rows before the train/test split. Confirm all three.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable used in this analysis
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Build a dataframe holding the transformed (component) values
# NOTE(review): each principal component is a linear mixture of ALL columns fed to PCA
# (which here include 'Mortality Rate' and its lag columns); relabelling the components
# with the original feature names is misleading — the downstream "feature importances"
# rank components, not these named variables. Confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies()
# NOTE(review): the location dummies are never used as model inputs — X below is built
# solely from principal_df — so this step only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model
# NOTE(review): this randomly shuffles time-ordered rows; with lagged features a
# chronological split would avoid look-ahead — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardising scaler on the training set only (applied to both sets below)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the training and test feature matrices with the scaler fitted on the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
# NOTE(review): KFold ignores the time ordering behind the lagged features; a
# TimeSeriesSplit would avoid validating on rows earlier than the training folds — confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9976974336489285
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability distributions
# (it normalises them and computes KL divergence); it is not a regression-residual
# metric — confirm this is the intended use for y_test vs y_pred.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0014714686404337467 R2 Score: 0.9978634041885316 RMSE: 0.038360 Entropy Value: 0.000335000984342334
# Rank the model inputs by their importance in the fitted Random Forest (highest first)
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.768576 |
| 0 | diabetes_prevalence | 0.169903 |
| 2 | male_smokers | 0.049100 |
| 5 | median_age | 0.009381 |
| 3 | life_expectancy | 0.002434 |
| 4 | aged_65_older | 0.000607 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — this cell only runs on the author's machine;
# consider a path relative to the notebook instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Czechia'
country2 = 'Estonia'
# Keep only the columns needed for the Random Forest country-health-index
# analysis and restrict the rows to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (per country) so the time series can be framed as a
# supervised learning problem for the Random Forest model. NaNs produced by
# the shifts (the first 1/7/30 rows of each country) are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX(review): the original code fit/transformed PCA on df_updated.iloc[:, 2:],
# a slice that includes the 'Mortality Rate' target itself. That leaks the
# target into the predictors and inflates the downstream model scores.
# The target column is excluded here; the lagged mortality columns are kept
# as deliberate autoregressive predictors.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components to match the number of input variables used in
# the Random Forest Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these column labels reuse the original feature names purely
# for readability; each principal component is a linear mixture of ALL of the
# PCA inputs, so downstream "feature importances" describe components, not
# the raw features they are named after.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to indicator columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model
# (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only,
# then applied to both splits, so no test-set statistics leak into the
# preprocessing step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; the key hyperparameters are tuned below, so
# only the random seed matters here.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the exhaustive grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9968447535278674
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above.
# Idiom: unpack best_params_ instead of copying each entry by hand
# (grid_search.best_estimator_ would be equivalent since refit=True).
best_rf_model = RandomForestRegressor(random_state=42,
                                      **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes a KL divergence — an unusual metric
# for regression residuals; it returns inf if y_pred has a zero where y_test
# does not. Interpret with care / confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019107375406635276 R2 Score: 0.9972255787761853 RMSE: 0.043712 Entropy Value: 0.0005664584928279815
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927005 |
| 2 | extreme_poverty | 0.053546 |
| 3 | gdp_per_capita | 0.010799 |
| 5 | population | 0.005969 |
| 0 | hospital_beds_per_thousand | 0.002133 |
| 4 | population_density | 0.000548 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative or configurable data path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep only the columns needed for the Random Forest population-health-index
# analysis and restrict the rows to the two countries of interest.
population_health_cols = ['location', 'date', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (per country) so the time series can be framed as a
# supervised learning problem for the Random Forest model. NaNs produced by
# the shifts (the first 1/7/30 rows of each country) are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX(review): the original code fit/transformed PCA on df_updated.iloc[:, 2:],
# a slice that includes the 'Mortality Rate' target itself. That leaks the
# target into the predictors and inflates the downstream model scores.
# The target column is excluded here; the lagged mortality columns are kept
# as deliberate autoregressive predictors.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components to match the number of input variables used in
# the Random Forest Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these column labels reuse the original feature names purely
# for readability; each principal component is a linear mixture of ALL of the
# PCA inputs, so downstream "feature importances" describe components, not
# the raw features they are named after.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to indicator columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model
# (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only,
# then applied to both splits, so no test-set statistics leak into the
# preprocessing step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; the key hyperparameters are tuned below, so
# only the random seed matters here.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the exhaustive grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9970316226679301
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above.
# Idiom: unpack best_params_ instead of copying each entry by hand
# (grid_search.best_estimator_ would be equivalent since refit=True).
best_rf_model = RandomForestRegressor(random_state=42,
                                      **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes a KL divergence — an unusual metric
# for regression residuals; it returns inf if y_pred has a zero where y_test
# does not. Interpret with care / confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007095434014830886 R2 Score: 0.9965188340747636 RMSE: 0.084234 Entropy Value: 0.0009978441951363514
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.556932 |
| 0 | diabetes_prevalence | 0.407805 |
| 2 | male_smokers | 0.023935 |
| 3 | life_expectancy | 0.006117 |
| 5 | median_age | 0.004886 |
| 4 | aged_65_older | 0.000324 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative or configurable data path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the country-health-index analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep only the columns needed for the Random Forest country-health-index
# analysis and restrict the rows to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous
# month mortality rates (per country) so the time series can be framed as a
# supervised learning problem for the Random Forest model. NaNs produced by
# the shifts (the first 1/7/30 rows of each country) are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Principal Component Analysis (PCA) to address multi-collinearity.
# FIX(review): the original code fit/transformed PCA on df_updated.iloc[:, 2:],
# a slice that includes the 'Mortality Rate' target itself. That leaks the
# target into the predictors and inflates the downstream model scores.
# The target column is excluded here; the lagged mortality columns are kept
# as deliberate autoregressive predictors.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components to match the number of input variables used in
# the Random Forest Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): these column labels reuse the original feature names purely
# for readability; each principal component is a linear mixture of ALL of the
# PCA inputs, so downstream "feature importances" describe components, not
# the raw features they are named after.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical 'location' column to indicator columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model
# (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Standardize the features: the scaler is fitted on the training split only,
# then applied to both splits, so no test-set statistics leak into the
# preprocessing step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; the key hyperparameters are tuned below, so
# only the random seed matters here.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the exhaustive grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the
# scaled training data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9967388889181926
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above.
# Idiom: unpack best_params_ instead of copying each entry by hand
# (grid_search.best_estimator_ would be equivalent since refit=True).
best_rf_model = RandomForestRegressor(random_state=42,
                                      **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes a KL divergence — an unusual metric
# for regression residuals; it returns inf if y_pred has a zero where y_test
# does not. Interpret with care / confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010420033179699867 R2 Score: 0.9948877173166316 RMSE: 0.102079 Entropy Value: 0.0011192503090849918
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.891315 |
| 5 | population | 0.082122 |
| 2 | extreme_poverty | 0.022988 |
| 3 | gdp_per_capita | 0.002865 |
| 4 | population_density | 0.000641 |
| 0 | hospital_beds_per_thousand | 0.000069 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative or configurable data path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run of the population-health-index analysis.
country1 = 'Bulgaria'
country2 = 'Latvia'
# Keep only the columns needed for the Random Forest population-health-index
# analysis and restrict the rows to the two countries of interest.
population_health_cols = ['location', 'date', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2065 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data, so past mortality values must be
# turned into ordinary feature columns before the model can use them to assess
# which variables best predict COVID-19 mortality per country.
# Lagged mortality rates are built per country with pandas shift(): one day,
# one week, and one month back, respectively.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column — the components therefore leaked
# the target into the model inputs and inflated the test scores. PCA is now
# fitted on predictor columns only (the six health indicators plus the lagged
# rates). NOTE(review): PCA is still fitted on all rows before the train/test
# split — consider fitting on the training split only to avoid test leakage.
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 6 components to match the number of input variables used downstream
# in the Random Forest Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# DataFrame of principal-component scores. NOTE(review): these columns are
# principal components, NOT the original variables; the original feature names
# are kept only so the downstream column selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies() so no categorical
# column remains in the frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component scores; target: the mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9520281114433315
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so reuse that model instead of manually rebuilding
# and retraining an identical copy.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes a KL divergence here — it is not a standard
# regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0034538780402943816 R2 Score: 0.9980017666453213 RMSE: 0.058770 Entropy Value: 0.0005870161018070032
# Rank the six model inputs by their Random Forest importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.784650 |
| 5 | median_age | 0.136987 |
| 2 | male_smokers | 0.036838 |
| 3 | life_expectancy | 0.022305 |
| 1 | female_smokers | 0.013887 |
| 4 | aged_65_older | 0.005333 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine;
# consider a path relative to the project root.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The country pairing analysed in this section.
country1 = 'Bulgaria'
country2 = 'Latvia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to write) on what could otherwise be a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2065 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data, so past mortality values must be
# turned into ordinary feature columns before the model can use them to assess
# which variables best predict COVID-19 mortality per country.
# Lagged mortality rates are built per country with pandas shift(): one day,
# one week, and one month back, respectively.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column — the components therefore leaked
# the target into the model inputs and inflated the test scores. PCA is now
# fitted on predictor columns only (the six country-index indicators plus the
# lagged rates). NOTE(review): PCA is still fitted on all rows before the
# train/test split — consider fitting on the training split only.
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 6 components to match the number of input variables used downstream
# in the Random Forest Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# DataFrame of principal-component scores. NOTE(review): these columns are
# principal components, NOT the original variables; the original feature names
# are kept only so the downstream column selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies() so no categorical
# column remains in the frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component scores; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9544889135900533
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so reuse that model instead of manually rebuilding
# and retraining an identical copy.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes a KL divergence here — it is not a standard
# regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0058068592192322644 R2 Score: 0.9966404546882018 RMSE: 0.076203 Entropy Value: 0.000744361683555891
# Rank the six model inputs by their Random Forest importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.647422 |
| 0 | hospital_beds_per_thousand | 0.258173 |
| 2 | extreme_poverty | 0.035262 |
| 1 | human_development_index | 0.027633 |
| 3 | gdp_per_capita | 0.025970 |
| 4 | population_density | 0.005540 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine;
# consider a path relative to the project root.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The country pairing analysed in this section.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to write) on what could otherwise be a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data, so past mortality values must be
# turned into ordinary feature columns before the model can use them to assess
# which variables best predict COVID-19 mortality per country.
# Lagged mortality rates are built per country with pandas shift(): one day,
# one week, and one month back, respectively.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column — the components therefore leaked
# the target into the model inputs and inflated the test scores. PCA is now
# fitted on predictor columns only (the six health indicators plus the lagged
# rates). NOTE(review): PCA is still fitted on all rows before the train/test
# split — consider fitting on the training split only to avoid test leakage.
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 6 components to match the number of input variables used downstream
# in the Random Forest Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# DataFrame of principal-component scores. NOTE(review): these columns are
# principal components, NOT the original variables; the original feature names
# are kept only so the downstream column selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies() so no categorical
# column remains in the frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component scores; target: the mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9965278391641423
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so reuse that model instead of manually rebuilding
# and retraining an identical copy.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes a KL divergence here — it is not a standard
# regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002681968001128092 R2 Score: 0.9984385356636368 RMSE: 0.051788 Entropy Value: 0.0006055225602219516
# Rank the six model inputs by their Random Forest importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | diabetes_prevalence | 0.756724 |
| 5 | median_age | 0.195600 |
| 1 | female_smokers | 0.036499 |
| 2 | male_smokers | 0.006774 |
| 3 | life_expectancy | 0.003531 |
| 4 | aged_65_older | 0.000872 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine;
# consider a path relative to the project root.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The country pairing analysed in this section.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned later do not trigger pandas' SettingWithCopyWarning
# (or silently fail to write) on what could otherwise be a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data, so past mortality values must be
# turned into ordinary feature columns before the model can use them to assess
# which variables best predict COVID-19 mortality per country.
# Lagged mortality rates are built per country with pandas shift(): one day,
# one week, and one month back, respectively.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows at the start of each country's series have no history; use 0 there.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# includes the 'Mortality Rate' TARGET column — the components therefore leaked
# the target into the model inputs and inflated the test scores. PCA is now
# fitted on predictor columns only (the six country-index indicators plus the
# lagged rates). NOTE(review): PCA is still fitted on all rows before the
# train/test split — consider fitting on the training split only.
predictor_cols = [c for c in df_updated.columns
                  if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Keep 6 components to match the number of input variables used downstream
# in the Random Forest Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# DataFrame of principal-component scores. NOTE(review): these columns are
# principal components, NOT the original variables; the original feature names
# are kept only so the downstream column selection keeps working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies() so no categorical
# column remains in the frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal-component scores; target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as the test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 81 x 10 fits across all CPU cores; results are
# unchanged because every fit still uses random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (R^2 by default)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so best_estimator_ is the tuned RandomForestRegressor
# (carrying the original random_state=42); rebuilding it by hand from
# best_params_ and refitting is redundant.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (inputs are normalised to sum to 1); applied to
# raw regression targets/predictions it is not a meaningful error metric —
# confirm intent or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003107698404524249 R2 Score: 0.9981906718406797 RMSE: 0.055747 Entropy Value: 0.0004506715934095217
# Rank the model inputs by the fitted forest's impurity-based importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.753233 |
| 1 | human_development_index | 0.216832 |
| 2 | extreme_poverty | 0.021982 |
| 3 | gdp_per_capita | 0.007136 |
| 4 | population_density | 0.000814 |
| 0 | hospital_beds_per_thousand | 0.000003 |
# Country Pair by Pair Analysis relative to male smokers
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): absolute, user-specific Windows path — use a relative path or a
# configurable constant so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on male smokers (13 pairs of countries).
# One sub-frame per country; .loc with a boolean mask is equivalent to df[mask].
df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Latvia = df.loc[df["location"] == "Latvia"]
# Drop the first two United Kingdom rows (same effect as tail(-2)).
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Stack the first country of each defined pair into one frame, preserving the
# original ordering.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the file was written above with to_csv("dataframe-one.csv") into
# the current working directory but is read back from an absolute Downloads
# path — this only lines up when the notebook's cwd is that folder. Also verify
# the index column written by to_csv does not reappear as an extra unnamed
# column on re-read.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Keep only the population-health predictors plus the target, restricted to the
# current country pair; a single .loc performs both the row and column selection.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of temporal order, so prior mortality values must be supplied
# explicitly as tabular features. Build per-country lags of the Mortality Rate
# for the previous day, week, and month with groupby + shift, and fill the
# leading NaNs (rows with no history yet) with 0.
for lag_column, lag_days in [('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)]:
    df_updated[lag_column] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date'; the remaining
# frame still contains 'Mortality Rate' and the three lag columns created
# above, so the prediction target leaks into the PCA inputs — exclude the
# target before fitting.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; standardising first is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components as model inputs.
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which still contains
# 'Mortality Rate' and its lag columns — the target leaks into the model inputs
# and likely inflates the near-perfect scores reported below.
# The transformed columns are principal components (linear mixes of ALL input
# columns), not the original variables, so label them PC1..PC6 rather than
# reusing original feature names, which would make the downstream importance
# table misleading.
pc_labels = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of autocorrelated daily data places near-identical
# neighbouring days on both sides of the split; a chronological split would be
# a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed, never fitted)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (n_estimators=100 is only a
# placeholder; GridSearchCV overrides it from param_grid below)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold on shuffled daily time-series rows lets highly
# autocorrelated neighbouring days land in both train and validation folds,
# inflating CV scores — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9985893709741042
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so best_estimator_ is the tuned RandomForestRegressor
# (carrying the original random_state=42); rebuilding it by hand from
# best_params_ and refitting is redundant.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (inputs are normalised to sum to 1); applied to
# raw regression targets/predictions it is not a meaningful error metric —
# confirm intent or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0035792870495897117 R2 Score: 0.999145763988884 RMSE: 0.059827 Entropy Value: 0.0004571636325590867
# Rank the model inputs by the fitted forest's impurity-based importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.742289 |
| 0 | cardiovasc_death_rate | 0.224611 |
| 2 | female_smokers | 0.019414 |
| 5 | median_age | 0.011607 |
| 3 | life_expectancy | 0.001914 |
| 4 | aged_65_older | 0.000164 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the file was written earlier with to_csv("dataframe-one.csv")
# into the current working directory but is read back from an absolute Downloads
# path — this only lines up when the notebook's cwd is that folder; verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Keep only the country-health predictors plus the target, restricted to the
# current country pair; a single .loc performs both the row and column selection.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of temporal order, so prior mortality values must be supplied
# explicitly as tabular features. Build per-country lags of the Mortality Rate
# for the previous day, week, and month with groupby + shift, and fill the
# leading NaNs (rows with no history yet) with 0.
for lag_column, lag_days in [('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)]:
    df_updated[lag_column] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date'; the remaining
# frame still contains 'Mortality Rate' and the three lag columns created
# above, so the prediction target leaks into the PCA inputs — exclude the
# target before fitting.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# (e.g. population) dominate the components; standardising first is the usual
# practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components as model inputs.
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which still contains
# 'Mortality Rate' and its lag columns — the target leaks into the model inputs
# and likely inflates the near-perfect scores reported below.
# The transformed columns are principal components (linear mixes of ALL input
# columns), not the original variables, so label them PC1..PC6 rather than
# reusing original feature names, which would make the downstream importance
# table misleading.
pc_labels = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of autocorrelated daily data places near-identical
# neighbouring days on both sides of the split; a chronological split would be
# a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed, never fitted)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (n_estimators=100 is only a
# placeholder; GridSearchCV overrides it from param_grid below)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate configurations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold on shuffled daily time-series rows lets highly
# autocorrelated neighbouring days land in both train and validation folds,
# inflating CV scores — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989520477467734
# GridSearchCV already refits the best configuration on the full training set
# (refit=True by default), so best_estimator_ is the tuned RandomForestRegressor
# (carrying the original random_state=42); rebuilding it by hand from
# best_params_ and refitting is redundant.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2, and entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (inputs are normalised to sum to 1); applied to
# raw regression targets/predictions it is not a meaningful error metric —
# confirm intent or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0026540444143001835 R2 Score: 0.9993665832657773 RMSE: 0.051517 Entropy Value: 0.00039769797240609057
# Rank the model inputs by the fitted forest's impurity-based importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927082 |
| 5 | population | 0.037657 |
| 2 | extreme_poverty | 0.019357 |
| 0 | hospital_beds_per_thousand | 0.014506 |
| 3 | gdp_per_capita | 0.001192 |
| 4 | population_density | 0.000205 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the file was written earlier with to_csv("dataframe-one.csv")
# into the current working directory but is read back from an absolute Downloads
# path — this only lines up when the notebook's cwd is that folder; verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the population-health predictors plus the target, restricted to the
# current country pair; a single .loc performs both the row and column selection.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of temporal order, so prior mortality values must be supplied
# explicitly as tabular features. Build per-country lags of the Mortality Rate
# for the previous day, week, and month with groupby + shift, and fill the
# leading NaNs (rows with no history yet) with 0.
for lag_column, lag_days in [('prev_day_mortality', 1),
                             ('prev_week_mortality', 7),
                             ('prev_month_mortality', 30)]:
    df_updated[lag_column] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date'; the remaining
# frame still contains 'Mortality Rate' and the three lag columns created
# above, so the prediction target leaks into the PCA inputs — exclude the
# target before fitting.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; standardising first is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the six leading principal components as model inputs.
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): pca was fitted on df_updated.iloc[:, 2:], which still contains
# 'Mortality Rate' and its lag columns — the target leaks into the model inputs
# and likely inflates the near-perfect scores reported below.
# The transformed columns are principal components (linear mixes of ALL input
# columns), not the original variables, so label them PC1..PC6 rather than
# reusing original feature names, which would make the downstream importance
# table misleading.
pc_labels = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of autocorrelated daily data places near-identical
# neighbouring days on both sides of the split; a chronological split would be
# a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (the test set is transformed, never fitted)
scaler = StandardScaler()
scaler.fit(X_train)
# [cell output] StandardScaler() — HTML estimator representation suppressed in this export
# Standardize both splits using the statistics fitted on the training data.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the hyperparameter grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9964956178544988
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ with ** replaces the error-prone manual copying of each
# individual hyperparameter (grid_search.best_estimator_ would also work).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalized to sum to 1) and computes their KL divergence;
# it is not a standard regression metric and zeros in y_test make it
# ill-defined. Kept for output compatibility — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032498598374813537 R2 Score: 0.9972639789533597 RMSE: 0.057008 Entropy Value: 0.0008594876046729041
# Rank the six inputs by the tuned model's impurity-based feature importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.520148 |
| 0 | cardiovasc_death_rate | 0.437998 |
| 5 | median_age | 0.025765 |
| 2 | female_smokers | 0.012087 |
| 3 | life_expectancy | 0.002962 |
| 4 | aged_65_older | 0.001041 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — runs only on the author's
# machine; prefer a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the socioeconomic features used by the country-health-index model,
# restricted to the rows for the current pair of countries.
socio_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
              'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (previous day/week/month) so the
# time series becomes a supervised-learning table a Random Forest can consume.
# Missing history at the start of each country's series is filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns (everything past the location/date columns)
# to address multi-collinearity among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# [cell output] PCA() — HTML estimator representation suppressed in this export
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the matrix passed to PCA (all columns from position 2 on)
# includes 'Mortality Rate' and its lagged copies, so the components below are
# partly built from the prediction target — potential target leakage; confirm
# this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of all
# inputs), not the original variables; reusing the raw feature names makes the
# later feature-importance table misleading. Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used as model inputs —
# X below is built from principal_df, not from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split with a fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training split only, avoiding test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# [cell output] StandardScaler() — HTML estimator representation suppressed in this export
# Standardize both splits using the statistics fitted on the training data.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the hyperparameter grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9962131899966675
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ with ** replaces the error-prone manual copying of each
# individual hyperparameter (grid_search.best_estimator_ would also work).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalized to sum to 1) and computes their KL divergence;
# it is not a standard regression metric and zeros in y_test make it
# ill-defined. Kept for output compatibility — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002658355780630809 R2 Score: 0.9977619596754977 RMSE: 0.051559 Entropy Value: 0.0007340555943675478
# Rank the six inputs by the tuned model's impurity-based feature importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.899612 |
| 5 | population | 0.064346 |
| 0 | hospital_beds_per_thousand | 0.017225 |
| 2 | extreme_poverty | 0.013530 |
| 3 | gdp_per_capita | 0.003408 |
| 4 | population_density | 0.001880 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — runs only on the author's
# machine; prefer a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the population-health features used by the population-health-index
# model, restricted to the rows for the current pair of countries.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (previous day/week/month) so the
# time series becomes a supervised-learning table a Random Forest can consume.
# Missing history at the start of each country's series is filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns (everything past the location/date columns)
# to address multi-collinearity among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# [cell output] PCA() — HTML estimator representation suppressed in this export
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): the matrix passed to PCA (all columns from position 2 on)
# includes 'Mortality Rate' and its lagged copies, so the components below are
# partly built from the prediction target — potential target leakage; confirm
# this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of all
# inputs), not the original variables; reusing the raw feature names makes the
# later feature-importance table misleading. Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used as model inputs —
# X below is built from principal_df, not from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split with a fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training split only, avoiding test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# [cell output] StandardScaler() — HTML estimator representation suppressed in this export
# Standardize both splits using the statistics fitted on the training data.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the hyperparameter grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9981898039474573
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ with ** replaces the error-prone manual copying of each
# individual hyperparameter (grid_search.best_estimator_ would also work).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalized to sum to 1) and computes their KL divergence;
# it is not a standard regression metric and zeros in y_test make it
# ill-defined. Kept for output compatibility — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0024182536451926267 R2 Score: 0.9989409896781432 RMSE: 0.049176 Entropy Value: 0.0004535153217103071
# Rank the six inputs by the tuned model's impurity-based feature importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.612473 |
| 0 | cardiovasc_death_rate | 0.345076 |
| 2 | female_smokers | 0.026558 |
| 1 | diabetes_prevalence | 0.013996 |
| 3 | life_expectancy | 0.001551 |
| 4 | aged_65_older | 0.000346 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — runs only on the author's
# machine; prefer a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the socioeconomic features used by the country-health-index model,
# restricted to the rows for the current pair of countries.
socio_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
              'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country (previous day/week/month) so the
# time series becomes a supervised-learning table a Random Forest can consume.
# Missing history at the start of each country's series is filled with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in {'prev_day_mortality': 1,
                          'prev_week_mortality': 7,
                          'prev_month_mortality': 30}.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns (everything past the location/date columns)
# to address multi-collinearity among the predictors.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# [cell output] PCA() — HTML estimator representation suppressed in this export
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the matrix passed to PCA (all columns from position 2 on)
# includes 'Mortality Rate' and its lagged copies, so the components below are
# partly built from the prediction target — potential target leakage; confirm
# this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of all
# inputs), not the original variables; reusing the raw feature names makes the
# later feature-importance table misleading. Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used as model inputs —
# X below is built from principal_df, not from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split with a fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training split only, avoiding test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
# [cell output] StandardScaler() — HTML estimator representation suppressed in this export
# Standardize both splits using the statistics fitted on the training data.
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the hyperparameter grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validated score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
# Refit a Random Forest using the best hyperparameters found by the grid search.
# Unpacking best_params_ with ** replaces the error-prone manual copying of each
# individual hyperparameter (grid_search.best_estimator_ would also work).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and an entropy value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (normalized to sum to 1) and computes their KL divergence;
# it is not a standard regression metric and zeros in y_test make it
# ill-defined. Kept for output compatibility — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001882911265242772 R2 Score: 0.9991754287359407 RMSE: 0.043393 Entropy Value: 0.0004605212242426924
# Rank the six inputs by the tuned model's impurity-based feature importances,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.523426 |
| 1 | human_development_index | 0.437981 |
| 2 | extreme_poverty | 0.036239 |
| 3 | gdp_per_capita | 0.001743 |
| 4 | population_density | 0.000594 |
| 0 | hospital_beds_per_thousand | 0.000017 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- runs only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Netherlands'
country2 = 'Slovenia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object so that the lagged
# column assignments later in the notebook do not trigger pandas'
# SettingWithCopyWarning (assigning into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 81.32 | 19.062 | 44.5 | 0.536669 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning table: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows without enough history receive NaN from shift(); replace with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus its lagged copies -- the prediction
# target is embedded in the PCA inputs (target leakage), which likely explains
# the near-perfect downstream scores. PCA is also fitted on unscaled data here,
# so large-magnitude columns dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the same (unscaled) matrix onto the first 6 principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, yet they are relabeled
# with the original feature names; the downstream feature-importance table
# inherits these misleading labels -- confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated 'location_*' dummy columns are never fed to the
# model -- X is taken from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility).
# NOTE(review): a random split shuffles a time series -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only;
# StandardScaler.fit returns the scaler itself, so construction and fitting chain.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only -- no refit, so no test-set
# statistics leak into the scaler)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored exhaustively by the grid search (81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the candidate fits across all CPU cores; it does not
# change any result.
# NOTE(review): plain k-fold CV ignores temporal order; with lagged mortality
# features this can leak future information across folds -- consider
# sklearn.model_selection.TimeSeriesSplit. TODO confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9990369085942957
# Refit a RandomForestRegressor with the best hyperparameter combination found
# by the grid search above (same fixed seed as the search's base estimator).
# best_params_ holds exactly the four tuned keys, so it can be splatted directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict the mortality rate on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual, and any zero in y_pred where
# y_test is nonzero makes the result infinite. Verify this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004094213219663488 R2 Score: 0.9995002849901965 RMSE: 0.063986 Entropy Value: 0.00025517686621591407
# Extract impurity-based feature importances from the fitted forest.
# NOTE(review): X was built from PCA-transformed columns that were merely
# relabeled with the original feature names, so each "importance" below
# belongs to a principal component, not to the named raw feature -- confirm
# before interpreting.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.932248 |
| 0 | cardiovasc_death_rate | 0.037284 |
| 2 | female_smokers | 0.025795 |
| 5 | median_age | 0.003006 |
| 3 | life_expectancy | 0.001421 |
| 4 | aged_65_older | 0.000247 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- runs only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Netherlands'
country2 = 'Slovenia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object so that the lagged
# column assignments later in the notebook do not trigger pandas'
# SettingWithCopyWarning (assigning into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning table: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows without enough history receive NaN from shift(); replace with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus its lagged copies -- the prediction
# target is embedded in the PCA inputs (target leakage), which likely explains
# the near-perfect downstream scores. PCA is also fitted on unscaled data here,
# so large-magnitude columns (e.g. population) dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the same (unscaled) matrix onto the first 6 principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, yet they are relabeled
# with the original feature names; the downstream feature-importance table
# inherits these misleading labels -- confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated 'location_*' dummy columns are never fed to the
# model -- X is taken from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility).
# NOTE(review): a random split shuffles a time series -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only;
# StandardScaler.fit returns the scaler itself, so construction and fitting chain.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only -- no refit, so no test-set
# statistics leak into the scaler)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored exhaustively by the grid search (81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the candidate fits across all CPU cores; it does not
# change any result.
# NOTE(review): plain k-fold CV ignores temporal order; with lagged mortality
# features this can leak future information across folds -- consider
# sklearn.model_selection.TimeSeriesSplit. TODO confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989307334211548
# Refit a RandomForestRegressor with the best hyperparameter combination found
# by the grid search above (same fixed seed as the search's base estimator).
# best_params_ holds exactly the four tuned keys, so it can be splatted directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict the mortality rate on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual, and any zero in y_pred where
# y_test is nonzero makes the result infinite. Verify this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00559759095128752 R2 Score: 0.9993167917577755 RMSE: 0.074817 Entropy Value: 0.00043829864165291835
# Extract impurity-based feature importances from the fitted forest.
# NOTE(review): X was built from PCA-transformed columns that were merely
# relabeled with the original feature names, so each "importance" below
# belongs to a principal component, not to the named raw feature -- confirm
# before interpreting.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.967464 |
| 2 | extreme_poverty | 0.027111 |
| 5 | population | 0.002733 |
| 3 | gdp_per_capita | 0.001380 |
| 0 | hospital_beds_per_thousand | 0.000901 |
| 4 | population_density | 0.000412 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- runs only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object so that the lagged
# column assignments later in the notebook do not trigger pandas'
# SettingWithCopyWarning (assigning into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 82.80 | 19.985 | 41.0 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning table: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows without enough history receive NaN from shift(); replace with 0.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus its lagged copies -- the prediction
# target is embedded in the PCA inputs (target leakage), which likely explains
# the near-perfect downstream scores. PCA is also fitted on unscaled data here,
# so large-magnitude columns dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the same (unscaled) matrix onto the first 6 principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, yet they are relabeled
# with the original feature names; the downstream feature-importance table
# inherits these misleading labels -- confirm before interpreting.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated 'location_*' dummy columns are never fed to the
# model -- X is taken from principal_df and y from 'Mortality Rate' only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: daily COVID-19 mortality rate
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split shuffles a time series -- confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (no test-set statistics leak into it)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (transform only -- no refit, so no test-set
# statistics leak into the scaler)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored exhaustively by the grid search (81 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelizes the candidate fits across all CPU cores; it does not
# change any result.
# NOTE(review): plain k-fold CV ignores temporal order; with lagged mortality
# features this can leak future information across folds -- consider
# sklearn.model_selection.TimeSeriesSplit. TODO confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (the default scorer for a regressor is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9647620002703616
# Refit a RandomForestRegressor with the best hyperparameter combination found
# by the grid search above (same fixed seed as the search's base estimator).
# best_params_ holds exactly the four tuned keys, so it can be splatted directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict the mortality rate on the held-out, scaled test split
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the Mortality Rate target
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; applying it to
# raw regression targets/predictions is unusual, and any zero in y_pred where
# y_test is nonzero makes the result infinite. Verify this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.19934385811888447 R2 Score: 0.9926835029165212 RMSE: 0.446479 Entropy Value: 0.002995808129382941
# Extract impurity-based feature importances from the fitted forest.
# NOTE(review): X was built from PCA-transformed columns that were merely
# relabeled with the original feature names, so each "importance" below
# belongs to a principal component, not to the named raw feature -- confirm
# before interpreting.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.821076 |
| 5 | median_age | 0.086721 |
| 1 | diabetes_prevalence | 0.033801 |
| 3 | life_expectancy | 0.032590 |
| 2 | female_smokers | 0.018086 |
| 4 | aged_65_older | 0.007726 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- runs only on this machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this analysis
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object so that the lagged
# column assignments later in the notebook do not trigger pandas'
# SettingWithCopyWarning (assigning into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table by adding lagged
# mortality features: previous day (1), previous week (7), previous month (30).
# The shift is applied per country so one country's history never bleeds into another's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)  # rows before the lag window opens get 0 instead of NaN
    )
# PCA to address multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] also contains 'Mortality Rate' and its lag columns,
# so the components are fit on the target itself — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: label the retained columns as principal components. Each PC is a linear
# combination of ALL the transformed columns, so reusing the original feature
# names (as before) mislabels the downstream feature-importance table.
# NOTE(review): the PCA input (iloc[:, 2:]) also contains 'Mortality Rate' and
# its lags, so the components partially encode the target — confirm intended.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model matrix: the retained principal components; target: the mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training data
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search, scored by 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Winning configuration and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9657040062307898
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Passing the whole best_params_ dict avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of y_test from y_pred after normalizing both into probability distributions —
# it is not a standard regression metric; confirm this is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.1755698880186287 R2 Score: 0.9935560764913614 RMSE: 0.419011 Entropy Value: 0.0030443221086945424
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.795197 |
| 5 | population | 0.117897 |
| 2 | extreme_poverty | 0.038008 |
| 3 | gdp_per_capita | 0.030219 |
| 4 | population_density | 0.018509 |
| 0 | hospital_beds_per_thousand | 0.000171 |
# Re-import the cleaned dataset (first country of each pairing) so this section
# starts again from the full, unfiltered frame.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'United States'
country2 = 'Austria'
# Restrict the frame to the population-health-index features (plus identifiers
# and the Mortality Rate target) for the two selected countries in one .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table by adding lagged
# mortality features: previous day (1), previous week (7), previous month (30).
# The shift is applied per country so one country's history never bleeds into another's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)  # rows before the lag window opens get 0 instead of NaN
    )
# PCA to address multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] also contains 'Mortality Rate' and its lag columns,
# so the components are fit on the target itself — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: label the retained columns as principal components. Each PC is a linear
# combination of ALL the transformed columns, so reusing the original feature
# names (as before) mislabels the downstream feature-importance table.
# NOTE(review): the PCA input (iloc[:, 2:]) also contains 'Mortality Rate' and
# its lags, so the components partially encode the target — confirm intended.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model matrix: the retained principal components; target: the mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training data
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search, scored by 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Winning configuration and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9707123801174072
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Passing the whole best_params_ dict avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of y_test from y_pred after normalizing both into probability distributions —
# it is not a standard regression metric; confirm this is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.015612908849576288 R2 Score: 0.9900116946003525 RMSE: 0.124952 Entropy Value: 0.0008956642056999705
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.884760 |
| 0 | cardiovasc_death_rate | 0.061283 |
| 2 | female_smokers | 0.022979 |
| 4 | aged_65_older | 0.015255 |
| 3 | life_expectancy | 0.008161 |
| 5 | median_age | 0.007562 |
# Re-import the cleaned dataset (first country of each pairing) so this section
# starts again from the full, unfiltered frame.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'United States'
country2 = 'Austria'
# Restrict the frame to the country-health-index features (plus identifiers and
# the Mortality Rate target) for the two selected countries in one .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2112 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table by adding lagged
# mortality features: previous day (1), previous week (7), previous month (30).
# The shift is applied per country so one country's history never bleeds into another's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)  # rows before the lag window opens get 0 instead of NaN
    )
# PCA to address multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] also contains 'Mortality Rate' and its lag columns,
# so the components are fit on the target itself — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: label the retained columns as principal components. Each PC is a linear
# combination of ALL the transformed columns, so reusing the original feature
# names (as before) mislabels the downstream feature-importance table.
# NOTE(review): the PCA input (iloc[:, 2:]) also contains 'Mortality Rate' and
# its lags, so the components partially encode the target — confirm intended.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical location column
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model matrix: the retained principal components; target: the mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training data
X_train_scaled, X_test_scaled = map(scaler.transform, (X_train, X_test))
# Base Random Forest regressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search, scored by 10-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Winning configuration and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9748898922979944
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Passing the whole best_params_ dict avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of y_test from y_pred after normalizing both into probability distributions —
# it is not a standard regression metric; confirm this is what was intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011356761380877888 R2 Score: 0.9927345504853691 RMSE: 0.106568 Entropy Value: 0.0008026741253104213
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.941349 |
| 2 | extreme_poverty | 0.024552 |
| 5 | population | 0.015274 |
| 4 | population_density | 0.010674 |
| 3 | gdp_per_capita | 0.007128 |
| 0 | hospital_beds_per_thousand | 0.001023 |
# Re-import the cleaned dataset (first country of each pairing) so this section
# starts again from the full, unfiltered frame.
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'Belgium'
country2 = 'Czechia'
# Restrict the frame to the population-health-index features (plus identifiers
# and the Mortality Rate target) for the two selected countries in one .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 79.38 | 19.027 | 43.3 | 0.919575 |
2094 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table by adding lagged
# mortality features: previous day (1), previous week (7), previous month (30).
# The shift is applied per country so one country's history never bleeds into another's.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby(['location'])['Mortality Rate']
        .shift(lag_days)
        .fillna(0)  # rows before the lag window opens get 0 instead of NaN
    )
# PCA to address multi-collinearity among the numeric columns.
# NOTE(review): iloc[:, 2:] also contains 'Mortality Rate' and its lag columns,
# so the components are fit on the target itself — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location_*' columns are never used below — X is built
# from principal_df and y only reads 'Mortality Rate' — so this step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (labelled with the original feature
# names by the PCA step above); y is the raw, unshifted mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series lets training rows come from dates
# after test rows, and the PCA above was fitted on all rows before this split —
# both can inflate the reported test scores; confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 set here is only a placeholder — every candidate value in
# param_grid below overrides it during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV's default scoring for a regressor is the estimator's R^2 score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981564421259727
# fit random forest model with best hyperparameters from above
# (**grid_search.best_params_ expands the same four tuned keyword arguments the
# original spelled out one by one; the fitted model is identical)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence; feeding it raw actual
# vs. predicted mortality values is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021770315476854027 R2 Score: 0.9980743421157474 RMSE: 0.147548 Entropy Value: 0.0008394924242857518
# Rank the six model inputs by Random-Forest importance, most influential first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.774151 |
| 0 | cardiovasc_death_rate | 0.164452 |
| 2 | female_smokers | 0.036317 |
| 5 | median_age | 0.022970 |
| 3 | life_expectancy | 0.001647 |
| 4 | aged_65_older | 0.000464 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Czechia'
# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep only the socio-economic indicators plus the identifiers and the target,
# then restrict the rows to the two countries being compared.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2094 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning table:
# a Random Forest has no notion of temporal order, so the previous day's, week's
# and month's mortality rates are materialised as explicit feature columns via
# pandas' shift().
# Shifting is done within each location group so one country's history never
# bleeds into another's; the leading rows that have no history are filled with 0,
# exactly as before.
mortality_lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in mortality_lags.items():
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: the original fitted the PCA on df_updated.iloc[:, 2:], a slice that still
# contains the target column 'Mortality Rate' itself — i.e. the components fed to
# the Random Forest were partly built from the value being predicted (target
# leakage, which inflates the downstream test scores). The PCA input is now
# restricted to the predictor columns only (the six socio-economic features plus
# the three lagged mortality-rate columns).
predictor_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these columns hold the first six principal components, NOT the
# original features; the feature names are kept only so the downstream code that
# selects these columns keeps working unchanged.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location_*' columns are never used below — X is built
# from principal_df and y only reads 'Mortality Rate' — so this step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (labelled with the original feature
# names by the PCA step above); y is the raw, unshifted mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series lets training rows come from dates
# after test rows, and the PCA above was fitted on all rows before this split —
# both can inflate the reported test scores; confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 set here is only a placeholder — every candidate value in
# param_grid below overrides it during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV's default scoring for a regressor is the estimator's R^2 score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9981977178578196
# fit random forest model with best hyperparameters from above
# (**grid_search.best_params_ expands the same four tuned keyword arguments the
# original spelled out one by one; the fitted model is identical)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence; feeding it raw actual
# vs. predicted mortality values is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.018443397054300942 R2 Score: 0.9983686192794139 RMSE: 0.135806 Entropy Value: 0.000594716015312401
# Rank the six model inputs by Random-Forest importance, most influential first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.939053 |
| 2 | extreme_poverty | 0.038798 |
| 5 | population | 0.014107 |
| 0 | hospital_beds_per_thousand | 0.004909 |
| 3 | gdp_per_capita | 0.002483 |
| 4 | population_density | 0.000649 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for Random Forest Model Analysis for the population health index
# Keep only the population-health indicators plus the identifiers and the target,
# then restrict the rows to the two countries being compared.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 82.66 | 19.718 | 42.0 | 0.411892 |
2132 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning table:
# a Random Forest has no notion of temporal order, so the previous day's, week's
# and month's mortality rates are materialised as explicit feature columns via
# pandas' shift().
# Shifting is done within each location group so one country's history never
# bleeds into another's; the leading rows that have no history are filled with 0,
# exactly as before.
mortality_lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in mortality_lags.items():
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: the original fitted the PCA on df_updated.iloc[:, 2:], a slice that still
# contains the target column 'Mortality Rate' itself — i.e. the components fed to
# the Random Forest were partly built from the value being predicted (target
# leakage, which inflates the downstream test scores). The PCA input is now
# restricted to the predictor columns only (the six population-health features
# plus the three lagged mortality-rate columns).
predictor_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these columns hold the first six principal components, NOT the
# original features; the feature names are kept only so the downstream code that
# selects these columns keeps working unchanged.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location_*' columns are never used below — X is built
# from principal_df and y only reads 'Mortality Rate' — so this step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (labelled with the original feature
# names by the PCA step above); y is the raw, unshifted mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series lets training rows come from dates
# after test rows, and the PCA above was fitted on all rows before this split —
# both can inflate the reported test scores; confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 set here is only a placeholder — every candidate value in
# param_grid below overrides it during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV's default scoring for a regressor is the estimator's R^2 score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.99396874660859
# fit random forest model with best hyperparameters from above
# (**grid_search.best_params_ expands the same four tuned keyword arguments the
# original spelled out one by one; the fitted model is identical)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence; feeding it raw actual
# vs. predicted mortality values is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09053132731245286 R2 Score: 0.9905744145265692 RMSE: 0.300884 Entropy Value: 0.005811101609293169
# Rank the six model inputs by Random-Forest importance, most influential first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.805098 |
| 0 | cardiovasc_death_rate | 0.155346 |
| 2 | female_smokers | 0.018175 |
| 5 | median_age | 0.016554 |
| 3 | life_expectancy | 0.003922 |
| 4 | aged_65_older | 0.000905 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'France'
# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep only the socio-economic indicators plus the identifiers and the target,
# then restrict the rows to the two countries being compared.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2132 rows × 9 columns
# Convert the per-country mortality time series into a supervised-learning table:
# a Random Forest has no notion of temporal order, so the previous day's, week's
# and month's mortality rates are materialised as explicit feature columns via
# pandas' shift().
# Shifting is done within each location group so one country's history never
# bleeds into another's; the leading rows that have no history are filled with 0,
# exactly as before.
mortality_lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, lag_days in mortality_lags.items():
    df_updated[lag_column] = grouped_mortality.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX: the original fitted the PCA on df_updated.iloc[:, 2:], a slice that still
# contains the target column 'Mortality Rate' itself — i.e. the components fed to
# the Random Forest were partly built from the value being predicted (target
# leakage, which inflates the downstream test scores). The PCA input is now
# restricted to the predictor columns only (the six socio-economic features plus
# the three lagged mortality-rate columns).
predictor_cols = [c for c in df_updated.columns if c not in ('location', 'date', 'Mortality Rate')]
pca = PCA()
pca.fit(df_updated[predictor_cols])
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these columns hold the first six principal components, NOT the
# original features; the feature names are kept only so the downstream code that
# selects these columns keeps working unchanged.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location_*' columns are never used below — X is built
# from principal_df and y only reads 'Mortality Rate' — so this step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (labelled with the original feature
# names by the PCA step above); y is the raw, unshifted mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series lets training rows come from dates
# after test rows, and the PCA above was fitted on all rows before this split —
# both can inflate the reported test scores; confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 set here is only a placeholder — every candidate value in
# param_grid below overrides it during the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV's default scoring for a regressor is the estimator's R^2 score.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9924204242316919
# fit random forest model with best hyperparameters from above
# (**grid_search.best_params_ expands the same four tuned keyword arguments the
# original spelled out one by one; the fitted model is identical)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence; feeding it raw actual
# vs. predicted mortality values is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09564363505394957 R2 Score: 0.9900421513308949 RMSE: 0.309263 Entropy Value: 0.005800126883421082
# Tabulate how strongly each input column influenced the fitted forest,
# most important first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.929469 |
| 5 | population | 0.033558 |
| 2 | extreme_poverty | 0.029972 |
| 3 | gdp_per_capita | 0.003907 |
| 0 | hospital_beds_per_thousand | 0.002285 |
| 4 | population_density | 0.000808 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on one
# machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'Italy'
country2 = 'Portugal'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two compared countries; .copy() detaches the subset from the
# parent frame so the lagged-column assignments that follow operate on an
# independent DataFrame (avoids pandas' SettingWithCopyWarning / chained
# assignment ambiguity).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2098 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of sequence order, so each row must carry its own history.
# shift() within each country adds the mortality rate observed 1 day, 7 days
# and 30 days earlier; rows with no history yet (the leading NaNs) become 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point includes 'Mortality Rate' and the
# three lagged-mortality columns, so the target (and near-copies of it) leaks
# into the PCA features used as X below — this largely explains the
# near-perfect scores. Fit PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns
# dominate the components; scale before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — the six columns are
# principal components (linear mixes of all PCA inputs), not the original
# variables they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from
# principal_df and y only from 'Mortality Rate' — so this step is dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled 10-fold CV on daily time-series rows inflates the
# score via temporal autocorrelation; consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9992491538823527
# Re-fit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model; the explicit re-fit is kept for
# clarity and reproducibility of random_state.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence
# between two *probability distributions* (it normalizes its inputs to sum
# to 1); applying it to raw regression targets/predictions is not a standard
# regression metric and is ill-defined where y_pred is 0 but y_test is not —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03916584085137451 R2 Score: 0.9965620865525731 RMSE: 0.197904 Entropy Value: 0.0019471953731532649
# Tabulate how strongly each input column influenced the fitted forest,
# most important first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.731503 |
| 5 | median_age | 0.189648 |
| 1 | diabetes_prevalence | 0.051121 |
| 2 | female_smokers | 0.025325 |
| 3 | life_expectancy | 0.002138 |
| 4 | aged_65_older | 0.000265 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on one
# machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'Italy'
country2 = 'Portugal'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two compared countries; .copy() detaches the subset from the
# parent frame so the lagged-column assignments that follow operate on an
# independent DataFrame (avoids pandas' SettingWithCopyWarning / chained
# assignment ambiguity).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2098 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of sequence order, so each row must carry its own history.
# shift() within each country adds the mortality rate observed 1 day, 7 days
# and 30 days earlier; rows with no history yet (the leading NaNs) become 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point includes 'Mortality Rate' and the
# three lagged-mortality columns, so the target (and near-copies of it) leaks
# into the PCA features used as X below — this largely explains the
# near-perfect scores. Fit PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns
# (e.g. population) dominate the components; scale before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — the six columns are
# principal components (linear mixes of all PCA inputs), not the original
# variables they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from
# principal_df and y only from 'Mortality Rate' — so this step is dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled 10-fold CV on daily time-series rows inflates the
# score via temporal autocorrelation; consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991962488532368
# Re-fit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model; the explicit re-fit is kept for
# clarity and reproducibility of random_state.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence
# between two *probability distributions* (it normalizes its inputs to sum
# to 1); applying it to raw regression targets/predictions is not a standard
# regression metric and is ill-defined where y_pred is 0 but y_test is not —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0346698323414571 R2 Score: 0.996956738825574 RMSE: 0.186198 Entropy Value: 0.0012806717112898633
# Tabulate how strongly each input column influenced the fitted forest,
# most important first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.923655 |
| 5 | population | 0.047005 |
| 2 | extreme_poverty | 0.026004 |
| 3 | gdp_per_capita | 0.003020 |
| 4 | population_density | 0.000296 |
| 0 | hospital_beds_per_thousand | 0.000020 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on one
# machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this section.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two compared countries; .copy() detaches the subset from the
# parent frame so the lagged-column assignments that follow operate on an
# independent DataFrame (avoids pandas' SettingWithCopyWarning / chained
# assignment ambiguity).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning layout: a Random Forest
# has no notion of sequence order, so each row must carry its own history.
# shift() within each country adds the mortality rate observed 1 day, 7 days
# and 30 days earlier; rows with no history yet (the leading NaNs) become 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] at this point includes 'Mortality Rate' and the
# three lagged-mortality columns, so the target (and near-copies of it) leaks
# into the PCA features used as X below — this largely explains the
# near-perfect scores. Fit PCA on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns
# dominate the components; scale before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — the six columns are
# principal components (linear mixes of all PCA inputs), not the original
# variables they are named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from
# principal_df and y only from 'Mortality Rate' — so this step is dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled 10-fold CV on daily time-series rows inflates the
# score via temporal autocorrelation; consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9963303624947721
# Re-fit a Random Forest on the full training set using the best
# hyperparameters found by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model; the explicit re-fit is kept for
# clarity and reproducibility of random_state.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence
# between two *probability distributions* (it normalizes its inputs to sum
# to 1); applying it to raw regression targets/predictions is not a standard
# regression metric and is ill-defined where y_pred is 0 but y_test is not —
# confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002100050408732467 R2 Score: 0.9987773329821902 RMSE: 0.045826 Entropy Value: 0.0002089676106423466
# Tabulate how strongly each input column influenced the fitted forest,
# most important first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.598715 |
| 5 | median_age | 0.242721 |
| 1 | diabetes_prevalence | 0.134221 |
| 2 | female_smokers | 0.020674 |
| 3 | life_expectancy | 0.003062 |
| 4 | aged_65_older | 0.000607 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on one
# machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Romania'
country2 = 'Serbia'
# Columns used by the Random Forest Model Analysis for the country health index.
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Restrict to the selected columns and to rows belonging to the two countries.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a shift never
# pulls values across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to shift from).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lag
# columns created above — the target leaks into the components and will
# inflate the downstream R^2. Confirm whether PCA should instead be fit on the
# predictor columns only.
# NOTE(review): PCA is fit on unscaled data here; StandardScaler is only
# applied after the PCA transform further below. Scaling before PCA is the
# usual practice so large-magnitude columns (e.g. population) do not dominate
# the components — confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the
# first six. NOTE(review): the PCA above was fit on all rows, including rows
# that become the test set below, so information from the test period leaks
# into the transform — fit PCA on the training split only to avoid this.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of
# all PCA input columns), not the raw features; labelling them with the raw
# feature names makes the later feature-importance table read as if it ranked
# the raw features. Names like 'PC1'..'PC6' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X
# below (X is built from principal_df only); confirm they are intentionally
# unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, which is questionable
# for a daily time series — a chronological split is the usual choice. Confirm
# intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test matrices with the statistics learned from the
# training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
# Refit the Random Forest on the training split using the hyperparameters the
# grid search selected. Unpacking best_params_ directly avoids re-typing each
# key by hand and cannot drift out of sync with the grid definition above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs and computes
# the KL divergence between two probability distributions; y_test/y_pred are
# samples of a rate, not distributions, so this value is hard to interpret —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003107698404524249 R2 Score: 0.9981906718406797 RMSE: 0.055747 Entropy Value: 0.0004506715934095217
# Importance of each model input, ranked from most to least influential.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.753233 |
| 1 | human_development_index | 0.216832 |
| 2 | extreme_poverty | 0.021982 |
| 3 | gdp_per_capita | 0.007136 |
| 4 | population_density | 0.000814 |
| 0 | hospital_beds_per_thousand | 0.000003 |
# Reload the dataframe that holds the first country of each country pairing
# produced in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Slovakia'
country2 = 'Spain'
# Columns used by the Random Forest Model Analysis for the population health index.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Restrict to the selected columns and to rows belonging to the two countries.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2092 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a shift never
# pulls values across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to shift from).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lag
# columns created above — the target leaks into the components and will
# inflate the downstream R^2. Confirm whether PCA should instead be fit on the
# predictor columns only.
# NOTE(review): PCA is fit on unscaled data here; StandardScaler is only
# applied after the PCA transform further below. Scaling before PCA is the
# usual practice so large-magnitude columns do not dominate the components —
# confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the
# first six. NOTE(review): the PCA above was fit on all rows, including rows
# that become the test set below, so information from the test period leaks
# into the transform — fit PCA on the training split only to avoid this.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of
# all PCA input columns), not the raw features; labelling them with the raw
# feature names makes the later feature-importance table read as if it ranked
# the raw features. Names like 'PC1'..'PC6' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X
# below (X is built from principal_df only); confirm they are intentionally
# unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, which is questionable
# for a daily time series — a chronological split is the usual choice. Confirm
# intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test matrices with the statistics learned from the
# training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9993652655177874
# Refit the Random Forest on the training split using the hyperparameters the
# grid search selected. Unpacking best_params_ directly avoids re-typing each
# key by hand and cannot drift out of sync with the grid definition above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs and computes
# the KL divergence between two probability distributions; y_test/y_pred are
# samples of a rate, not distributions, so this value is hard to interpret —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002584137818108158 R2 Score: 0.9995264678213833 RMSE: 0.050834 Entropy Value: 0.0002068980142316218
# Importance of each model input, ranked from most to least influential.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.982145 |
| 1 | diabetes_prevalence | 0.009200 |
| 2 | female_smokers | 0.004551 |
| 0 | cardiovasc_death_rate | 0.003429 |
| 3 | life_expectancy | 0.000589 |
| 4 | aged_65_older | 0.000087 |
# Reload the dataframe that holds the first country of each country pairing
# produced in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Slovakia'
country2 = 'Spain'
# Columns used by the Random Forest Model Analysis for the country health index.
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Restrict to the selected columns and to rows belonging to the two countries.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2092 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a shift never
# pulls values across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to shift from).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lag
# columns created above — the target leaks into the components and will
# inflate the downstream R^2. Confirm whether PCA should instead be fit on the
# predictor columns only.
# NOTE(review): PCA is fit on unscaled data here; StandardScaler is only
# applied after the PCA transform further below. Scaling before PCA is the
# usual practice so large-magnitude columns (e.g. population) do not dominate
# the components — confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the
# first six. NOTE(review): the PCA above was fit on all rows, including rows
# that become the test set below, so information from the test period leaks
# into the transform — fit PCA on the training split only to avoid this.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components (linear mixes of
# all PCA input columns), not the raw features; labelling them with the raw
# feature names makes the later feature-importance table read as if it ranked
# the raw features. Names like 'PC1'..'PC6' would be less misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X
# below (X is built from principal_df only); confirm they are intentionally
# unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, which is questionable
# for a daily time series — a chronological split is the usual choice. Confirm
# intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test matrices with the statistics learned from the
# training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975963885456272
# Refit the Random Forest on the training split using the hyperparameters the
# grid search selected. Unpacking best_params_ directly avoids re-typing each
# key by hand and cannot drift out of sync with the grid definition above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test set: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs and computes
# the KL divergence between two probability distributions; y_test/y_pred are
# samples of a rate, not distributions, so this value is hard to interpret —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008134240357790014 R2 Score: 0.9985094353207385 RMSE: 0.090190 Entropy Value: 0.0006655143831412369
# Importance of each model input, ranked from most to least influential.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 8.879057e-01 |
| 5 | population | 6.994735e-02 |
| 2 | extreme_poverty | 4.072583e-02 |
| 3 | gdp_per_capita | 1.082704e-03 |
| 4 | population_density | 3.382074e-04 |
| 0 | hospital_beds_per_thousand | 1.596907e-07 |
# Reload the dataframe that holds the first country of each country pairing
# produced in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis.
country1 = 'Switzerland'
country2 = 'Bulgaria'
# Columns used by the Random Forest Model Analysis for the population health index.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Restrict to the selected columns and to rows belonging to the two countries.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 83.78 | 18.436 | 43.1 | 0.322149 |
2066 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a shift never
# pulls values across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to shift from).
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the prediction target) and the three lag
# columns created above — the target leaks into the components and will
# inflate the downstream R^2. Confirm whether PCA should instead be fit on the
# predictor columns only.
# NOTE(review): PCA is fit on unscaled data here; StandardScaler is only
# applied after the PCA transform further below. Scaling before PCA is the
# usual practice so large-magnitude columns do not dominate the components —
# confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned by exhaustive grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10) on the scaled training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original notebook run:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9590497939781765
# Refit a forest with the winning hyperparameters.  Unpacking best_params_
# replaces the original key-by-key copying and cannot drift out of sync with
# the grid definition (equivalent to grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# returns the KL divergence between two probability distributions; it is not a
# standard regression error metric, so interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original notebook run:
#   MSE: 0.004896549532936876  R2 Score: 0.9980297240563568
#   RMSE: 0.069975  Entropy Value: 0.0006950015035866526
# Rank model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components relabeled with the
# original column names, so this ranking does not directly measure the raw
# variables' influence.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.776305 |
| 1 | diabetes_prevalence | 0.101123 |
| 5 | median_age | 0.059598 |
| 2 | female_smokers | 0.031843 |
| 3 | life_expectancy | 0.017355 |
| 4 | aged_65_older | 0.013777 |
# Load dataframe-one.csv -- per the earlier step, the dataframe holding the
# first country of each country pairing (full OWID-style columns; the table
# rendered below shows its shape and contents).
# NOTE(review): hard-coded absolute Windows path -- not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Switzerland'
country2 = 'Bulgaria'
# Keep the socioeconomic ("country health index") features plus the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# FIX: materialize the two-country slice with .copy() so the lagged-column
# assignments in the next step write to an independent frame rather than a
# view of the full dataset (avoids pandas SettingWithCopyWarning / silently
# lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.530 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2066 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning problem: lagged mortality
# features for the previous day, week, and month, computed per country so a
# shift never crosses a location boundary.  NaNs from each country's warm-up
# window are treated as "no prior mortality" and set to 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# country-health predictors.
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column -- the prediction target leaked into
# the model inputs and inflated every downstream score.  The target is now
# excluded before fitting and transforming.
# NOTE(review): PCA is fitted on unscaled data here, so high-variance columns
# (e.g. population) dominate the components; standardize before PCA as well.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# NOTE(review): these column labels reuse the ORIGINAL feature names, but the
# columns are principal components (linear mixtures of all inputs); the labels
# -- and the feature importances derived from them later -- are misleading.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['hospital_beds_per_thousand', 'human_development_index',
                                     'extreme_poverty', 'gdp_per_capita',
                                     'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.  NOTE(review): the dummy columns are not
# used in X below; this call is kept only to preserve the original pipeline's
# dataframe state.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Train/test split for the Random Forest model.
# NOTE(review): a shuffled random split on a time series mixes future and past
# observations; a chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply the
# same transform to both splits (avoids test-set leakage into the scaling).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned by exhaustive grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10) on the scaled training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original notebook run:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 100}
#   Best CV score: 0.956167269462329
# Refit a forest with the winning hyperparameters.  Unpacking best_params_
# replaces the original key-by-key copying and cannot drift out of sync with
# the grid definition (equivalent to grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# returns the KL divergence between two probability distributions; it is not a
# standard regression error metric, so interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original notebook run:
#   MSE: 0.005126255663843806  R2 Score: 0.9979372947935078
#   RMSE: 0.071598  Entropy Value: 0.0006230810700906027
# Rank model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components relabeled with the
# original column names, so this ranking does not directly measure the raw
# variables' influence.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.595190 |
| 5 | population | 0.208472 |
| 0 | hospital_beds_per_thousand | 0.114103 |
| 2 | extreme_poverty | 0.043787 |
| 3 | gdp_per_capita | 0.025727 |
| 4 | population_density | 0.012720 |
# Load dataframe-one.csv -- per the earlier step, the dataframe holding the
# first country of each country pairing (full OWID-style columns; the table
# rendered below shows its shape and contents).
# NOTE(review): hard-coded absolute Windows path -- not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Cyprus'
country2 = 'Latvia'
# Keep the population-health features plus the target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# FIX: materialize the two-country slice with .copy() so the lagged-column
# assignments in the next step write to an independent frame rather than a
# view of the full dataset (avoids pandas SettingWithCopyWarning / silently
# lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 75.29 | 19.754 | 43.9 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning problem: lagged mortality
# features for the previous day, week, and month, computed per country so a
# shift never crosses a location boundary.  NaNs from each country's warm-up
# window are treated as "no prior mortality" and set to 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# population-health predictors.
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column -- the prediction target leaked into
# the model inputs and inflated every downstream score.  The target is now
# excluded before fitting and transforming.
# NOTE(review): PCA is fitted on unscaled data here, so high-variance columns
# dominate the components; consider standardizing before PCA as well.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# NOTE(review): these column labels reuse the ORIGINAL feature names, but the
# columns are principal components (linear mixtures of all inputs); the labels
# -- and the feature importances derived from them later -- are misleading.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['cardiovasc_death_rate', 'diabetes_prevalence',
                                     'female_smokers', 'life_expectancy',
                                     'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.  NOTE(review): the dummy columns are not
# used in X below; this call is kept only to preserve the original pipeline's
# dataframe state.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Train/test split for the Random Forest model.
# NOTE(review): a shuffled random split on a time series mixes future and past
# observations; a chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply the
# same transform to both splits (avoids test-set leakage into the scaling).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned by exhaustive grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10) on the scaled training set.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output of the original notebook run:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9920063450501019
# Refit a forest with the winning hyperparameters.  Unpacking best_params_
# replaces the original key-by-key copying and cannot drift out of sync with
# the grid definition (equivalent to grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments and
# returns the KL divergence between two probability distributions; it is not a
# standard regression error metric, so interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output of the original notebook run:
#   MSE: 0.0009323680911373003  R2 Score: 0.9981522797865945
#   RMSE: 0.030535  Entropy Value: 0.0003241275435360705
# Rank model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components relabeled with the
# original column names, so this ranking does not directly measure the raw
# variables' influence.
feature_importances = pd.DataFrame({'feature': selected_cols,
                                    'importance': best_rf_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.822942 |
| 0 | cardiovasc_death_rate | 0.117464 |
| 2 | female_smokers | 0.032022 |
| 5 | median_age | 0.022714 |
| 3 | life_expectancy | 0.002734 |
| 4 | aged_65_older | 0.002124 |
# Load dataframe-one.csv -- per the earlier step, the dataframe holding the
# first country of each country pairing (full OWID-style columns; the table
# rendered below shows its shape and contents).
# NOTE(review): hard-coded absolute Windows path -- not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Cyprus'
country2 = 'Latvia'
# Keep the socioeconomic ("country health index") features plus the target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# FIX: materialize the two-country slice with .copy() so the lagged-column
# assignments in the next step write to an independent frame rather than a
# view of the full dataset (avoids pandas SettingWithCopyWarning / silently
# lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2065 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series into a supervised-learning problem: lagged mortality
# features for the previous day, week, and month, computed per country so a
# shift never crosses a location boundary.  NaNs from each country's warm-up
# window are treated as "no prior mortality" and set to 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# country-health predictors.
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column -- the prediction target leaked into
# the model inputs and inflated every downstream score.  The target is now
# excluded before fitting and transforming.
# NOTE(review): PCA is fitted on unscaled data here, so high-variance columns
# (e.g. population) dominate the components; standardize before PCA as well.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# NOTE(review): these column labels reuse the ORIGINAL feature names, but the
# columns are principal components (linear mixtures of all inputs); the labels
# -- and the feature importances derived from them later -- are misleading.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['hospital_beds_per_thousand', 'human_development_index',
                                     'extreme_poverty', 'gdp_per_capita',
                                     'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.  NOTE(review): the dummy columns are not
# used in X below; this call is kept only to preserve the original pipeline's
# dataframe state.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Train/test split for the Random Forest model.
# NOTE(review): a shuffled random split on a time series mixes future and past
# observations; a chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply the
# same transform to both splits (avoids test-set leakage into the scaling).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928418947763469
# Refit a random forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ keeps the model in sync with
# the search space instead of copying each key by hand (error-prone duplication).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and zeros in y_pred can make it infinite.
# Kept for comparability with earlier results; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007558023623301819
R2 Score: 0.9985021888720862
RMSE: 0.027492
Entropy Value: 0.00025084148640829615
# Tabulate the fitted forest's importances, ordered from most to least influential.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.828245 |
| 5 | population | 0.132583 |
| 2 | extreme_poverty | 0.033619 |
| 3 | gdp_per_capita | 0.003182 |
| 4 | population_density | 0.002366 |
| 0 | hospital_beds_per_thousand | 0.000005 |
# Country Pair by Pair Analysis relative to female smokers
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): absolute, user-specific path -- consider a relative path or configuration.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on female smokers (13 pairs of countries)
# One filtered frame per country; each keeps only that country's daily rows.
df_Canada = df[(df.location == "Canada")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_Iceland = df[(df.location == "Iceland")]
df_Italy = df[(df.location == "Italy")]
df_Portugal = df[(df.location == "Portugal")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_UnitedStates = df[(df.location == "United States")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Estonia = df[(df.location == "Estonia")]
df_Ireland = df[(df.location == "Ireland")]
df_Latvia = df[(df.location == "Latvia")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Romania = df[(df.location == "Romania")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Spain = df[(df.location == "Spain")]
df_Switzerland = df[(df.location == "Switzerland")]
df_Bulgaria= df[(df.location == "Bulgaria")]
df_Czechia = df[(df.location == "Czechia")]
df_France = df[(df.location == "France")]
df_Serbia = df[(df.location == "Serbia")]
# NOTE(review): tail(-2) drops the first two United Kingdom rows -- presumably to
# align its date range with other countries; confirm the two-row offset is correct.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory, but re-imported below from
# C:/Users/marco/Downloads -- verify both paths refer to the same file.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Current country pair under analysis.
country1 = 'Canada'
country2 = 'Cyprus'
# Keep only the population-health features plus the target (Mortality Rate),
# restricted to the two countries of the current pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's lags from bleeding into the other country's series.
# NOTE(review): shift() assumes rows are already chronological within each country --
# the displayed data looks date-sorted, but confirm before relying on it.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates "0 mortality" history for the first
# day/week/month of each country's series; consider dropping those rows -- confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# includes 'Mortality Rate' and its three lags -- the PCA inputs therefore contain
# the prediction target (target leakage), likely inflating the downstream CV/R^2
# scores. Confirm whether the target columns should be excluded here.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables -- reusing the raw feature names means the later feature-importance
# table describes components, not the named predictors. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (under their feature aliases); target: Mortality Rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardization parameters on the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the random forest hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search under 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9982903099276361
# Refit a random forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ keeps the model in sync with
# the search space instead of copying each key by hand (error-prone duplication).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and zeros in y_pred can make it infinite.
# Kept for comparability with earlier results; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002818832110675199
R2 Score: 0.9991712781195069
RMSE: 0.053093
Entropy Value: 0.00033073646040135244
# Tabulate the fitted forest's importances, ordered from most to least influential.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.802712 |
| 1 | diabetes_prevalence | 0.121281 |
| 2 | male_smokers | 0.037316 |
| 5 | median_age | 0.034987 |
| 3 | life_expectancy | 0.003115 |
| 4 | aged_65_older | 0.000589 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): reads from an absolute Downloads path, while the export above wrote
# "dataframe-one.csv" to the working directory -- verify both refer to the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Current country pair under analysis.
country1 = 'Canada'
country2 = 'Cyprus'
# Keep only the country-health features plus the target (Mortality Rate),
# restricted to the two countries of the current pair.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's lags from bleeding into the other country's series.
# NOTE(review): shift() assumes rows are already chronological within each country --
# the displayed data looks date-sorted, but confirm before relying on it.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates "0 mortality" history for the first
# day/week/month of each country's series; consider dropping those rows -- confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# includes 'Mortality Rate' and its three lags -- the PCA inputs therefore contain
# the prediction target (target leakage), likely inflating the downstream CV/R^2
# scores. Confirm whether the target columns should be excluded here.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables -- reusing the raw feature names means the later feature-importance
# table describes components, not the named predictors. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (under their feature aliases); target: Mortality Rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardization parameters on the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the random forest hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search under 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
# Refit a random forest on the full training split using the best hyperparameters
# found by the grid search. Unpacking best_params_ keeps the model in sync with
# the search space instead of copying each key by hand (error-prone duplication).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model: MSE, RMSE, R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and zeros in y_pred can make it infinite.
# Kept for comparability with earlier results; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032162020605789846
R2 Score: 0.9990544534349546
RMSE: 0.056712
Entropy Value: 0.000341364428693161
# Tabulate the fitted forest's importances, ordered from most to least influential.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.515688 |
| 1 | human_development_index | 0.452145 |
| 2 | extreme_poverty | 0.028238 |
| 3 | gdp_per_capita | 0.003655 |
| 4 | population_density | 0.000192 |
| 0 | hospital_beds_per_thousand | 0.000083 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): reads from an absolute Downloads path, while the export above wrote
# "dataframe-one.csv" to the working directory -- verify both refer to the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Current country pair under analysis.
country1 = 'Denmark'
country2 = 'Finland'
# Keep only the population-health features plus the target (Mortality Rate),
# restricted to the two countries of the current pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's lags from bleeding into the other country's series.
# NOTE(review): shift() assumes rows are already chronological within each country --
# the displayed data looks date-sorted, but confirm before relying on it.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates "0 mortality" history for the first
# day/week/month of each country's series; consider dropping those rows -- confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# includes 'Mortality Rate' and its three lags -- the PCA inputs therefore contain
# the prediction target (target leakage), likely inflating the downstream CV/R^2
# scores. Confirm whether the target columns should be excluded here.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the original
# variables -- reusing the raw feature names means the later feature-importance
# table describes components, not the named predictors. Confirm intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (under their feature aliases); target: Mortality Rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardization parameters on the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
# [notebook output] StandardScaler()  (sklearn estimator HTML repr omitted in export)
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search (seeded for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values to search over.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989113781858382
# Refit a forest with the configuration the grid search selected.
# (The grid keys match the constructor arguments, so unpack them directly.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence — confirm this is
# the intended "entropy" measure for regression outputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008523656273191014 R2 Score: 0.9951646978023675 RMSE: 0.092324 Entropy Value: 0.0017240442447165512
# Rank the model inputs by the fitted forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.955970 |
| 2 | male_smokers | 0.028467 |
| 3 | life_expectancy | 0.006111 |
| 5 | median_age | 0.005548 |
| 0 | cardiovasc_death_rate | 0.003586 |
| 4 | aged_65_older | 0.000319 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis.
country1 = 'Denmark'
country2 = 'Finland'
# Socio-economic ("country health index") features plus the target, restricted
# to the two countries of interest (row filter and column selection in one step).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
# Display the filtered frame.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) so the time series
# becomes a supervised-learning table a Random Forest can consume.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so history never leaks across locations;
    # rows with no history yet (the first `lag_days` days) are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` spans every column from the third onward, which
# here includes 'Mortality Rate' (selected above) and its lagged copies — the
# prediction target leaks into the PCA inputs. TODO confirm intended.
# NOTE(review): inputs are not standardised before PCA, so large-scale columns
# (e.g. population) will dominate the leading components — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# [notebook output] PCA()  (sklearn estimator HTML repr omitted in export)
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first 6 components of the transformed data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original features — labelling them with the
# original feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies().
# NOTE(review): the resulting dummy columns are not used by the model below —
# X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (carrying original feature names).
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'gdp_per_capita',
    'population_density',
    'population',
]
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs with statistics estimated on the training set only.
scaler = StandardScaler().fit(X_train)
# [notebook output] StandardScaler()  (sklearn estimator HTML repr omitted in export)
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search (seeded for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values to search over.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
# Refit a forest with the configuration the grid search selected.
# (The grid keys match the constructor arguments, so unpack them directly.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence — confirm this is
# the intended "entropy" measure for regression outputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008329520038950185 R2 Score: 0.995274827461516 RMSE: 0.091266 Entropy Value: 0.0016508456649447665
# Rank the model inputs by the fitted forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956574 |
| 2 | extreme_poverty | 0.028531 |
| 5 | population | 0.008398 |
| 3 | gdp_per_capita | 0.006081 |
| 4 | population_density | 0.000371 |
| 0 | hospital_beds_per_thousand | 0.000044 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis.
country1 = 'Iceland'
country2 = 'Italy'
# Population-health features plus the target, restricted to the two countries
# of interest (row filter and column selection in one step).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
# Display the filtered frame.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 117.992 | 5.31 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) so the time series
# becomes a supervised-learning table a Random Forest can consume.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so history never leaks across locations;
    # rows with no history yet (the first `lag_days` days) are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` spans every column from the third onward, which
# here includes 'Mortality Rate' (selected above) and its lagged copies — the
# prediction target leaks into the PCA inputs. TODO confirm intended.
# NOTE(review): inputs are not standardised before PCA, so large-scale columns
# will dominate the leading components — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# [notebook output] PCA()  (sklearn estimator HTML repr omitted in export)
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first 6 components of the transformed data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original features — labelling them with the
# original feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies().
# NOTE(review): the resulting dummy columns are not used by the model below —
# X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (carrying original feature names).
selected_cols = [
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'male_smokers',
    'life_expectancy',
    'aged_65_older',
    'median_age',
]
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs with statistics estimated on the training set only.
scaler = StandardScaler().fit(X_train)
# [notebook output] StandardScaler()  (sklearn estimator HTML repr omitted in export)
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search (seeded for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values to search over.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986885167827337
# Refit a forest with the configuration the grid search selected.
# (The grid keys match the constructor arguments, so unpack them directly.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence — confirm this is
# the intended "entropy" measure for regression outputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025865966557999595 R2 Score: 0.9978789245215064 RMSE: 0.160829 Entropy Value: 0.001439826222505498
# Rank the model inputs by the fitted forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.493613 |
| 1 | diabetes_prevalence | 0.469690 |
| 5 | median_age | 0.024296 |
| 2 | male_smokers | 0.010599 |
| 3 | life_expectancy | 0.001623 |
| 4 | aged_65_older | 0.000179 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this analysis.
country1 = 'Iceland'
country2 = 'Italy'
# Socio-economic ("country health index") features plus the target, restricted
# to the two countries of interest (row filter and column selection in one step).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
# Display the filtered frame.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 2.91 | 0.949 | 0.2 | 46482.958 | 3.404 | 372903 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) so the time series
# becomes a supervised-learning table a Random Forest can consume.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so history never leaks across locations;
    # rows with no history yet (the first `lag_days` days) are filled with 0.
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): `iloc[:, 2:]` spans every column from the third onward, which
# here includes 'Mortality Rate' (selected above) and its lagged copies — the
# prediction target leaks into the PCA inputs. TODO confirm intended.
# NOTE(review): inputs are not standardised before PCA, so large-scale columns
# (e.g. population) will dominate the leading components — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# [notebook output] PCA()  (sklearn estimator HTML repr omitted in export)
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first 6 components of the transformed data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL
# PCA input columns), not the original features — labelling them with the
# original feature names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies().
# NOTE(review): the resulting dummy columns are not used by the model below —
# X is built from principal_df only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs: the six principal components (carrying original feature names).
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'gdp_per_capita',
    'population_density',
    'population',
]
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Hold out 30% of rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs with statistics estimated on the training set only.
scaler = StandardScaler().fit(X_train)
# [notebook output] StandardScaler()  (sklearn estimator HTML repr omitted in export)
# Scale both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search (seeded for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values to search over.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.999041254481315
# Refit a forest with the configuration the grid search selected.
# (The grid keys match the constructor arguments, so unpack them directly.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and returns their KL divergence — confirm this is
# the intended "entropy" measure for regression outputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.020316963384106206 R2 Score: 0.9983339569880426 RMSE: 0.142538 Entropy Value: 0.001115730684759821
# Rank the model inputs by the fitted forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.897053 |
| 5 | population | 0.078632 |
| 2 | extreme_poverty | 0.022551 |
| 3 | gdp_per_capita | 0.001409 |
| 4 | population_density | 0.000254 |
| 0 | hospital_beds_per_thousand | 0.000100 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this section.
country1 = 'Portugal'
country2 = 'Slovenia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross
# the boundary between the two countries' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling leading lags with 0 fabricates a zero mortality rate
# for each country's first day/week/month; dropping those rows instead would
# avoid biasing the model — confirm which is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): at this point iloc[:,2:] includes 'Mortality Rate' (the target)
# and its lag columns, so the target leaks into the PCA inputs. PCA is also fit
# on the full dataset before the train/test split and on unscaled data —
# consider fitting on scaled training features only.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project all rows onto the first six principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6; reusing the
# original feature names suggests a 1:1 mapping that does not exist.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Features come from the PCA projection; the target stays in df_updated —
# rows of the two frames align by position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split of time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain shuffled K-fold CV on time-series rows lets future
# observations inform past folds; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998368534343579
# fit random forest model with best hyperparameters from above.
# **-unpacking grid_search.best_params_ is equivalent to copying each tuned
# hyperparameter by hand, but cannot silently drop a key if the grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# probability distributions (it normalizes them) and computes KL divergence.
# y_test / y_pred are raw mortality rates, not distributions, so this value
# has no clear statistical meaning here — confirm intent or drop the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0031857365835024896 R2 Score: 0.9984052886270314 RMSE: 0.056442 Entropy Value: 0.00046176412460026606
# Extract per-feature importances from the fitted Random Forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCs, not to the original columns — labelling them
# with `selected_cols` names is misleading. Confirm whether PCA output or
# the raw features were intended as the model input.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least important for display.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.932445 |
| 0 | cardiovasc_death_rate | 0.036452 |
| 2 | male_smokers | 0.027502 |
| 5 | median_age | 0.002234 |
| 3 | life_expectancy | 0.001093 |
| 4 | aged_65_older | 0.000274 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this section.
country1 = 'Portugal'
country2 = 'Slovenia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross
# the boundary between the two countries' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling leading lags with 0 fabricates a zero mortality rate
# for each country's first day/week/month; dropping those rows instead would
# avoid biasing the model — confirm which is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): at this point iloc[:,2:] includes 'Mortality Rate' (the target)
# and its lag columns, so the target leaks into the PCA inputs. PCA is also fit
# on the full dataset before the train/test split and on unscaled data —
# consider fitting on scaled training features only.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project all rows onto the first six principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6; reusing the
# original feature names suggests a 1:1 mapping that does not exist.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Features come from the PCA projection; the target stays in df_updated —
# rows of the two frames align by position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split of time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain shuffled K-fold CV on time-series rows lets future
# observations inform past folds; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
# fit random forest model with best hyperparameters from above.
# **-unpacking grid_search.best_params_ is equivalent to copying each tuned
# hyperparameter by hand, but cannot silently drop a key if the grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# probability distributions (it normalizes them) and computes KL divergence.
# y_test / y_pred are raw mortality rates, not distributions, so this value
# has no clear statistical meaning here — confirm intent or drop the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038490153568325174 R2 Score: 0.9980732655059875 RMSE: 0.062040 Entropy Value: 0.000510878761187502
# Extract per-feature importances from the fitted Random Forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCs, not to the original columns — labelling them
# with `selected_cols` names is misleading. Confirm whether PCA output or
# the raw features were intended as the model input.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least important for display.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.964288 |
| 2 | extreme_poverty | 0.029185 |
| 5 | population | 0.004776 |
| 3 | gdp_per_capita | 0.001256 |
| 4 | population_density | 0.000316 |
| 0 | hospital_beds_per_thousand | 0.000179 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this section.
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2126 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country (groupby 'location') so lags never cross
# the boundary between the two countries' time series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling leading lags with 0 fabricates a zero mortality rate
# for each country's first day/week/month; dropping those rows instead would
# avoid biasing the model — confirm which is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): at this point iloc[:,2:] includes 'Mortality Rate' (the target)
# and its lag columns, so the target leaks into the PCA inputs. PCA is also fit
# on the full dataset before the train/test split and on unscaled data —
# consider fitting on scaled training features only.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project all rows onto the first six principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6; reusing the
# original feature names suggests a 1:1 mapping that does not exist.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Features come from the PCA projection; the target stays in df_updated —
# rows of the two frames align by position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle split of time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (base estimator for the search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain shuffled K-fold CV on time-series rows lets future
# observations inform past folds; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9636756360711152
# fit random forest model with best hyperparameters from above.
# **-unpacking grid_search.best_params_ is equivalent to copying each tuned
# hyperparameter by hand, but cannot silently drop a key if the grid changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out, scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) treats its arguments as
# probability distributions (it normalizes them) and computes KL divergence.
# y_test / y_pred are raw mortality rates, not distributions, so this value
# has no clear statistical meaning here — confirm intent or drop the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.3274210506772977 R2 Score: 0.9879826989155525 RMSE: 0.572207 Entropy Value: 0.0035336135372386534
# Extract per-feature importances from the fitted Random Forest.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCs, not to the original columns — labelling them
# with `selected_cols` names is misleading. Confirm whether PCA output or
# the raw features were intended as the model input.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Rank from most to least important for display.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.820240 |
| 5 | median_age | 0.074668 |
| 2 | male_smokers | 0.031916 |
| 1 | diabetes_prevalence | 0.031298 |
| 3 | life_expectancy | 0.027753 |
| 4 | aged_65_older | 0.014125 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this section.
country1 = 'Sweden'
country2 = 'United Kingdom'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2126 rows × 9 columns
# A Random Forest is an ensemble learner for non-sequential, tabular data: it has
# no built-in notion of temporal order. To use it on this time series we convert
# the problem to supervised learning by materialising the target's recent history
# as explicit feature columns. For each country we take the mortality rate observed
# one day, one week, and one month earlier (pandas shift() within each group), and
# fill the leading rows — where no history exists yet — with 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX(review): the original fit on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' TARGET column itself — projecting the target into the
# components leaks the answer into X and inflates the model's apparent accuracy.
# PCA is now fit only on the predictor columns (features + lagged mortality).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components — one per original input variable of the
# country-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the columns below are principal components, NOT the original
# variables; the original names are kept only so downstream cells keep working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never placed in X below
# (X is built from principal_df), so the encoding only affects df_updated's shape.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (labelled with the original feature
# names — see the PCA cell); y is the raw per-day mortality rate. Rows align
# positionally because principal_df was built from df_updated in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 random split; random_state pins the shuffle for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducible trees)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): CV folds are random over rows of a time series, so adjacent days
# can appear in both train and validation folds — scores may be optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9657040062307898
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence between the
# two arrays after normalising each to sum to 1 — it treats y_test/y_pred as
# distributions, not as paired regression values. Confirm this is the intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.1755698880186287 R2 Score: 0.9935560764913614 RMSE: 0.419011 Entropy Value: 0.0030443221086945424
# Rank the (PCA-derived) features by the fitted forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.795197 |
| 5 | population | 0.117897 |
| 2 | extreme_poverty | 0.038008 |
| 3 | gdp_per_capita | 0.030219 |
| 4 | population_density | 0.018509 |
| 0 | hospital_beds_per_thousand | 0.000171 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — make it relative/configurable before sharing.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United States'
country2 = 'Austria'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materialises an independent frame so the lagged-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 9 columns
# A Random Forest is an ensemble learner for non-sequential, tabular data: it has
# no built-in notion of temporal order. To use it on this time series we convert
# the problem to supervised learning by materialising the target's recent history
# as explicit feature columns. For each country we take the mortality rate observed
# one day, one week, and one month earlier (pandas shift() within each group), and
# fill the leading rows — where no history exists yet — with 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX(review): the original fit on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' TARGET column itself — projecting the target into the
# components leaks the answer into X and inflates the model's apparent accuracy.
# PCA is now fit only on the predictor columns (features + lagged mortality).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components — one per original input variable of the
# population-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the columns below are principal components, NOT the original
# variables; the original names are kept only so downstream cells keep working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never placed in X below
# (X is built from principal_df), so the encoding only affects df_updated's shape.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (labelled with the original feature
# names — see the PCA cell); y is the raw per-day mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 random split; random_state pins the shuffle for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducible trees)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): random CV folds over a time series can be optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9726217165926947
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of normalised
# distributions, not a paired regression metric — confirm this is the intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011712039224039057 R2 Score: 0.9925072626920814 RMSE: 0.108222 Entropy Value: 0.0007421212804027486
# Rank the (PCA-derived) features by the fitted forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.885072 |
| 0 | cardiovasc_death_rate | 0.064410 |
| 2 | male_smokers | 0.022757 |
| 4 | aged_65_older | 0.011487 |
| 5 | median_age | 0.008366 |
| 3 | life_expectancy | 0.007908 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — make it relative/configurable before sharing.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United States'
country2 = 'Austria'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises an independent frame so the lagged-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2112 rows × 9 columns
# A Random Forest is an ensemble learner for non-sequential, tabular data: it has
# no built-in notion of temporal order. To use it on this time series we convert
# the problem to supervised learning by materialising the target's recent history
# as explicit feature columns. For each country we take the mortality rate observed
# one day, one week, and one month earlier (pandas shift() within each group), and
# fill the leading rows — where no history exists yet — with 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX(review): the original fit on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' TARGET column itself — projecting the target into the
# components leaks the answer into X and inflates the model's apparent accuracy.
# PCA is now fit only on the predictor columns (features + lagged mortality).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components — one per original input variable of the
# country-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the columns below are principal components, NOT the original
# variables; the original names are kept only so downstream cells keep working.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never placed in X below
# (X is built from principal_df), so the encoding only affects df_updated's shape.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (labelled with the original feature
# names — see the PCA cell); y is the raw per-day mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 random split; random_state pins the shuffle for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducible trees)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): random CV folds over a time series can be optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9748898922979944
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of normalised
# distributions, not a paired regression metric — confirm this is the intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011356761380877888 R2 Score: 0.9927345504853691 RMSE: 0.106568 Entropy Value: 0.0008026741253104213
# Rank the (PCA-derived) features by the fitted forest's impurity-based importances.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.941349 |
| 2 | extreme_poverty | 0.024552 |
| 5 | population | 0.015274 |
| 4 | population_density | 0.010674 |
| 3 | gdp_per_capita | 0.007128 |
| 0 | hospital_beds_per_thousand | 0.001023 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — make it relative/configurable before sharing.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materialises an independent frame so the lagged-column assignments in the
# next cell do not trigger pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 9 columns
# A Random Forest is an ensemble learner for non-sequential, tabular data: it has
# no built-in notion of temporal order. To use it on this time series we convert
# the problem to supervised learning by materialising the target's recent history
# as explicit feature columns. For each country we take the mortality rate observed
# one day, one week, and one month earlier (pandas shift() within each group), and
# fill the leading rows — where no history exists yet — with 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# FIX(review): the original fit on df_updated.iloc[:, 2:], which includes the
# 'Mortality Rate' TARGET column itself — projecting the target into the
# components leaks the answer into X and inflates the model's apparent accuracy.
# PCA is now fit only on the predictor columns (features + lagged mortality).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 principal components — one per original input variable of the
# population-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the columns below are principal components, NOT the original
# variables; the original names are kept only so downstream cells keep working.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so
# the test set does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (these hyperparameters are placeholders; GridSearchCV below tunes the
# real values over param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows within a country repeat the same static features, so
# random CV folds hold near-duplicate rows of the same country in both train
# and validation — consider grouped or time-based CV. Confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998306333473295
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids hand-copying each key and stays in sync with whatever grid was
# searched (random_state is not in the grid, so it is safe to pass here).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalising both vectors into distributions; mortality rates are not
# probability distributions, so this value is hard to interpret — confirm it
# is wanted as a model metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.017273039382732797 R2 Score: 0.9986315561079507 RMSE: 0.131427 Entropy Value: 0.000694873976607357
# Rank the model inputs by their Random Forest importance scores,
# highest-contribution feature first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.723162 |
| 0 | cardiovasc_death_rate | 0.231758 |
| 2 | male_smokers | 0.038044 |
| 5 | median_age | 0.004786 |
| 3 | life_expectancy | 0.001834 |
| 4 | aged_65_older | 0.000415 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hard-coded absolute Windows path — this cell only runs on the
# author's machine; parameterise the path to make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this analysis run.
country1 = 'Belgium'
country2 = 'Estonia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country
# so the time series can be framed as a supervised-learning table.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# The earliest rows of each country have no history, so their lags are NaN;
# zero-fill them to keep every row usable downstream.
for lag_col in lag_periods:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set information leaks into the component directions — confirm this is
# acceptable for the study design.
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which appears to include 'Mortality Rate' (the target) and the lag columns
# just created — verify that feeding the target into PCA is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the data onto the first six principal components (ranked by
# explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all PCA inputs), not the
# named variable itself — the feature-importance table later should be read as
# importances of PC1..PC6. Confirm the labelling is intentional.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X is built
# from principal_df below); this call effectively just removes 'location'
# from df_updated — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so
# the test set does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (these hyperparameters are placeholders; GridSearchCV below tunes the
# real values over param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows within a country repeat the same static features, so
# random CV folds hold near-duplicate rows of the same country in both train
# and validation — consider grouped or time-based CV. Confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984095979233917
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids hand-copying each key and stays in sync with whatever grid was
# searched (random_state is not in the grid, so it is safe to pass here).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalising both vectors into distributions; mortality rates are not
# probability distributions, so this value is hard to interpret — confirm it
# is wanted as a model metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.016179009133714634 R2 Score: 0.9987182298530174 RMSE: 0.127197 Entropy Value: 0.0006399632348900017
# Rank the model inputs by their Random Forest importance scores,
# highest-contribution feature first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.932614 |
| 2 | extreme_poverty | 0.038249 |
| 5 | population | 0.019770 |
| 0 | hospital_beds_per_thousand | 0.005374 |
| 3 | gdp_per_capita | 0.003504 |
| 4 | population_density | 0.000490 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hard-coded absolute Windows path — this cell only runs on the
# author's machine; parameterise the path to make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this analysis run.
country1 = 'Ireland'
country2 = 'Latvia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2073 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country
# so the time series can be framed as a supervised-learning table.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# The earliest rows of each country have no history, so their lags are NaN;
# zero-fill them to keep every row usable downstream.
for lag_col in lag_periods:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set information leaks into the component directions — confirm this is
# acceptable for the study design.
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which appears to include 'Mortality Rate' (the target) and the lag columns
# just created — verify that feeding the target into PCA is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the data onto the first six principal components (ranked by
# explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all PCA inputs), not the
# named variable itself — the feature-importance table later should be read as
# importances of PC1..PC6. Confirm the labelling is intentional.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X is built
# from principal_df below); this call effectively just removes 'location'
# from df_updated — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so
# the test set does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (these hyperparameters are placeholders; GridSearchCV below tunes the
# real values over param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows within a country repeat the same static features, so
# random CV folds hold near-duplicate rows of the same country in both train
# and validation — consider grouped or time-based CV. Confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998374045977475
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids hand-copying each key and stays in sync with whatever grid was
# searched (random_state is not in the grid, so it is safe to pass here).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalising both vectors into distributions; mortality rates are not
# probability distributions, so this value is hard to interpret — confirm it
# is wanted as a model metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003030155883805974 R2 Score: 0.9986252667840227 RMSE: 0.055047 Entropy Value: 0.0004986576827960621
# Rank the model inputs by their Random Forest importance scores,
# highest-contribution feature first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.747948 |
| 0 | cardiovasc_death_rate | 0.211509 |
| 2 | male_smokers | 0.032826 |
| 5 | median_age | 0.003667 |
| 3 | life_expectancy | 0.003526 |
| 4 | aged_65_older | 0.000523 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE: hard-coded absolute Windows path — this cell only runs on the
# author's machine; parameterise the path to make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this analysis run.
country1 = 'Ireland'
country2 = 'Latvia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2073 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1-day, 7-day, 30-day) within each country
# so the time series can be framed as a supervised-learning table.
lag_periods = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = mortality_by_country.shift(periods)
# The earliest rows of each country have no history, so their lags are NaN;
# zero-fill them to keep every row usable downstream.
for lag_col in lag_periods:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set information leaks into the component directions — confirm this is
# acceptable for the study design.
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date',
# which appears to include 'Mortality Rate' (the target) and the lag columns
# just created — verify that feeding the target into PCA is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the data onto the first six principal components (ranked by
# explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all PCA inputs), not the
# named variable itself — the feature-importance table later should be read as
# importances of PC1..PC6. Confirm the labelling is intentional.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X is built
# from principal_df below); this call effectively just removes 'location'
# from df_updated — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so
# the test set does not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (these hyperparameters are placeholders; GridSearchCV below tunes the
# real values over param_grid)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows within a country repeat the same static features, so
# random CV folds hold near-duplicate rows of the same country in both train
# and validation — consider grouped or time-based CV. Confirm intent.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9979400711573009
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids hand-copying each key and stays in sync with whatever grid was
# searched (random_state is not in the grid, so it is safe to pass here).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence
# after normalising both vectors into distributions; mortality rates are not
# probability distributions, so this value is hard to interpret — confirm it
# is wanted as a model metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004705897262529415 R2 Score: 0.9978650097467429 RMSE: 0.068600 Entropy Value: 0.0007148305943023663
# Rank the model inputs by their Random Forest importance scores,
# highest-contribution feature first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.945415 |
| 2 | extreme_poverty | 0.036526 |
| 5 | population | 0.010191 |
| 0 | hospital_beds_per_thousand | 0.003836 |
| 3 | gdp_per_capita | 0.003491 |
| 4 | population_density | 0.000540 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific Windows path — parameterise or use a
# relative path for portability. Dates load as plain strings (no parse_dates).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the frame to identifiers, the population-health features, and the
# target for the two countries under comparison. A single .loc subset is
# equivalent to the original column-select-then-row-filter two-step.
_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags in row order — this assumes rows are already
# sorted by date within each location; verify upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this fabricates "zero mortality" for each country's first
# 1/7/30 rows; dropping the warm-up rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans the six health features PLUS 'Mortality Rate'
# (the target) and its three lagged copies, so the components later fed to the
# model carry the target itself — this leaks the answer and explains the
# near-perfect R^2 downstream. PCA is also fit on unscaled columns, so
# large-variance inputs dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project onto the first 6 of the 10 fitted components (6 features + target + 3 lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are cosmetic — each column is a principal
# component (a mixture of all ten PCA inputs, including the target), not the
# original feature it is named after. Downstream feature importances inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: per-row mortality rate (also embedded in the PCA inputs above — leakage).
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the regressor's R^2.
# NOTE(review): the folds are a shuffled split of time-series rows, and the PCA
# inputs upstream included the target and its lags, so this CV score is
# inflated by leakage — consider TimeSeriesSplit and removing the target from
# the PCA matrix.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986500446225092
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf if y_pred is 0 anywhere y_test
# is non-zero. Consider dropping it or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009737687060063437 R2 Score: 0.9987308600470954 RMSE: 0.098680 Entropy Value: 0.001010970627044727
# Rank the model's inputs by impurity-based feature importance, largest first.
# NOTE(review): the inputs here are principal components that merely carry the
# original feature names — these scores belong to the components, not to the
# named columns.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.761081 |
| 0 | cardiovasc_death_rate | 0.183282 |
| 2 | male_smokers | 0.032007 |
| 5 | median_age | 0.018832 |
| 3 | life_expectancy | 0.003997 |
| 4 | aged_65_older | 0.000802 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific Windows path — parameterise or use a
# relative path for portability. Dates load as plain strings (no parse_dates).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Restrict the frame to identifiers, the country-health features, and the
# target for the two countries under comparison. A single .loc subset is
# equivalent to the original column-select-then-row-filter two-step.
_country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags in row order — this assumes rows are already
# sorted by date within each location; verify upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this fabricates "zero mortality" for each country's first
# 1/7/30 rows; dropping the warm-up rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans the six country-health features PLUS
# 'Mortality Rate' (the target) and its three lagged copies, so the components
# later fed to the model carry the target itself — this leaks the answer and
# explains the near-perfect R^2 downstream. PCA is also fit on unscaled
# columns (population vs. indices), so large-variance inputs dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project onto the first 6 of the 10 fitted components (6 features + target + 3 lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are cosmetic — each column is a principal
# component (a mixture of all ten PCA inputs, including the target), not the
# original feature it is named after. Downstream feature importances inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: per-row mortality rate (also embedded in the PCA inputs above — leakage).
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the regressor's R^2.
# NOTE(review): the folds are a shuffled split of time-series rows, and the PCA
# inputs upstream included the target and its lags, so this CV score is
# inflated by leakage — consider TimeSeriesSplit and removing the target from
# the PCA matrix.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf if y_pred is 0 anywhere y_test
# is non-zero. Consider dropping it or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007512088981123421 R2 Score: 0.9990209284610492 RMSE: 0.086672 Entropy Value: 0.0009194530556352238
# Rank the model's inputs by impurity-based feature importance, largest first.
# NOTE(review): the inputs here are principal components that merely carry the
# original feature names — these scores belong to the components, not to the
# named columns.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.943109 |
| 2 | extreme_poverty | 0.037435 |
| 5 | population | 0.012103 |
| 3 | gdp_per_capita | 0.006234 |
| 0 | hospital_beds_per_thousand | 0.000694 |
| 4 | population_density | 0.000425 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific Windows path — parameterise or use a
# relative path for portability. Dates load as plain strings (no parse_dates).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Slovakia'
# Restrict the frame to identifiers, the population-health features, and the
# target for the two countries under comparison. A single .loc subset is
# equivalent to the original column-select-then-row-filter two-step.
_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() lags in row order — this assumes rows are already
# sorted by date within each location; verify upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this fabricates "zero mortality" for each country's first
# 1/7/30 rows; dropping the warm-up rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans the six health features PLUS 'Mortality Rate'
# (the target) and its three lagged copies, so the components later fed to the
# model carry the target itself — this leaks the answer and explains the
# near-perfect R^2 downstream. PCA is also fit on unscaled columns, so
# large-variance inputs dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project onto the first 6 of the 10 fitted components (6 features + target + 3 lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are cosmetic — each column is a principal
# component (a mixture of all ten PCA inputs, including the target), not the
# original feature it is named after. Downstream feature importances inherit
# this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: per-row mortality rate (also embedded in the PCA inputs above — leakage).
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows mixes past and future.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the regressor's R^2.
# NOTE(review): the folds are a shuffled split of time-series rows, and the PCA
# inputs upstream included the target and its lags, so this CV score is
# inflated by leakage — consider TimeSeriesSplit and removing the target from
# the PCA matrix.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980209439218296
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its two arguments into
# probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf if y_pred is 0 anywhere y_test
# is non-zero. Consider dropping it or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0025858758735811936 R2 Score: 0.998541908086158 RMSE: 0.050852 Entropy Value: 0.00025608225493814756
# Rank the model's inputs by impurity-based feature importance, largest first.
# NOTE(review): the inputs here are principal components that merely carry the
# original feature names — these scores belong to the components, not to the
# named columns.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.635662 |
| 0 | cardiovasc_death_rate | 0.191981 |
| 1 | diabetes_prevalence | 0.151447 |
| 2 | male_smokers | 0.017079 |
| 3 | life_expectancy | 0.003517 |
| 4 | aged_65_older | 0.000314 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific Windows path — parameterise or use a
# relative path for portability. Dates load as plain strings (no parse_dates).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Romania'
country2 = 'Slovakia'
# Restrict the frame to identifiers, the country-health features, and the
# target for the two countries under comparison. A single .loc subset is
# equivalent to the original column-select-then-row-filter two-step.
_country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features per country (1-day, 7-day and 30-day lags),
# replacing the leading NaNs introduced by shift() with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the target)
# and the three lagged mortality columns, and PCA is fit on the full dataset before
# the train/test split — both are sources of target leakage that likely inflate the
# CV/test scores reported below; confirm this is intended.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so columns
# with the largest magnitudes (e.g. population) will dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the first six.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, not the original
# variables; labelling them with the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
# Consider renaming to PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not fed to the
# model below, since X is taken from the PCA frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix from the principal-component frame, target from the encoded frame.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics on the training partition only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameters (3 * 3 * 3 * 3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9947150727401987
# Refit a forest using exactly the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out metrics: MSE, RMSE, R^2 and scipy.stats.entropy. Note that
# entropy(pk, qk) normalises both vectors into distributions and returns the
# KL divergence D(y_test || y_pred), not a classical entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0023029360576114406 R2 Score: 0.9987014487129855 RMSE: 0.047989 Entropy Value: 0.00021180348784876973
# Rank the model inputs by the forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.646503 |
| 1 | human_development_index | 0.328172 |
| 2 | extreme_poverty | 0.018557 |
| 3 | gdp_per_capita | 0.005316 |
| 4 | population_density | 0.000793 |
| 0 | hospital_beds_per_thousand | 0.000660 |
# Reload the dataframe that holds the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Switzerland'
# Restrict to the population-health-index features (plus identifiers and the target)
# for the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features per country (1-day, 7-day and 30-day lags),
# replacing the leading NaNs introduced by shift() with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the target)
# and the three lagged mortality columns, and PCA is fit on the full dataset before
# the train/test split — both are sources of target leakage that likely inflate the
# CV/test scores reported below; confirm this is intended.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so columns
# with the largest magnitudes will dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the first six.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, not the original
# variables; labelling them with the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
# Consider renaming to PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not fed to the
# model below, since X is taken from the PCA frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Feature matrix from the principal-component frame, target from the encoded frame.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics on the training partition only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameters (3 * 3 * 3 * 3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9988894734018896
# Refit a forest using exactly the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out metrics: MSE, RMSE, R^2 and scipy.stats.entropy. Note that
# entropy(pk, qk) normalises both vectors into distributions and returns the
# KL divergence D(y_test || y_pred), not a classical entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008646276456698085 R2 Score: 0.998600555703449 RMSE: 0.092985 Entropy Value: 0.00042564211305673786
# Rank the model inputs by the forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.969299 |
| 2 | male_smokers | 0.017999 |
| 5 | median_age | 0.005651 |
| 1 | diabetes_prevalence | 0.004935 |
| 3 | life_expectancy | 0.001889 |
| 4 | aged_65_older | 0.000228 |
# Reload the dataframe that holds the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Switzerland'
# Restrict to the country-health-index features (plus identifiers and the target)
# for the two countries being compared.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.00 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features per country (1-day, 7-day and 30-day lags),
# replacing the leading NaNs introduced by shift() with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the target)
# and the three lagged mortality columns, and PCA is fit on the full dataset before
# the train/test split — both are sources of target leakage that likely inflate the
# CV/test scores reported below; confirm this is intended.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so columns
# with the largest magnitudes (e.g. population) will dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the same (unscaled) matrix onto the fitted components and keep the first six.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, not the original
# variables; labelling them with the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
# Consider renaming to PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not fed to the
# model below, since X is taken from the PCA frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix from the principal-component frame, target from the encoded frame.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics on the training partition only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameters (3 * 3 * 3 * 3 = 81 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984087201734722
# Refit a forest using exactly the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out metrics: MSE, RMSE, R^2 and scipy.stats.entropy. Note that
# entropy(pk, qk) normalises both vectors into distributions and returns the
# KL divergence D(y_test || y_pred), not a classical entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009004477715309808 R2 Score: 0.9985425789881667 RMSE: 0.094892 Entropy Value: 0.0005624411655893531
# Rank the model inputs by the forest's impurity-based importances.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.946716 |
| 5 | population | 0.027480 |
| 2 | extreme_poverty | 0.023142 |
| 3 | gdp_per_capita | 0.002243 |
| 4 | population_density | 0.000363 |
| 0 | hospital_beds_per_thousand | 0.000055 |
# Reload the dataframe that holds the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Restrict to the population-health-index features (plus identifiers and the target)
# for the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features per country (1-day, 7-day and 30-day lags),
# replacing the leading NaNs introduced by shift() with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the target)
# and the three lagged mortality columns, and PCA is fit on the full dataset before
# the train/test split — both are sources of target leakage that likely inflate the
# CV/test scores reported below; confirm this is intended.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so columns
# with the largest magnitudes will dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per predictor fed into the
# population-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original
# predictors. The previous version labelled them with the raw feature names
# ('cardiovasc_death_rate', ...), which made the downstream feature-importance
# table appear to rank the original variables when it actually ranked
# abstract components. Label them honestly as PC1..PC6.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns themselves are never
# fed to the model below; kept for parity with the wider analysis).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.946009198558348
# Refit a Random Forest with the best configuration found by the grid search.
# Unpacking best_params_ replaces the previous copy-paste of each individual
# key, which could silently drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and it returns inf whenever a predicted value
# is 0 where the true value is non-zero. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0036901120196825726 R2 Score: 0.9977493146946407 RMSE: 0.060746 Entropy Value: 0.00036888555731396776
# Rank the model inputs by impurity-based importance, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.830856 |
| 5 | median_age | 0.065756 |
| 3 | life_expectancy | 0.034074 |
| 2 | male_smokers | 0.031315 |
| 1 | diabetes_prevalence | 0.021356 |
| 4 | aged_65_older | 0.016643 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or a configuration constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Keep only the socioeconomic / health-system predictors needed for the
# country-health-index Random Forest analysis, restricted to this pair.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 9 columns
'''
Convert the time series into a supervised-learning table by adding lagged
mortality-rate columns (previous day, week, and month) via pandas shift().
A Random Forest is a non-sequential ensemble learner, so each row must be a
self-contained observation; the lag columns carry the temporal signal so the
model can rank predictors of COVID-19 mortality per country.
'''
# Lag the per-country mortality rate by 1, 7, and 30 days; rows with no
# history (the start of each country's series) get 0 instead of NaN.
for lag_days, lag_col in ((1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Decorrelate the numeric predictors with a full PCA fit to address
# multi-collinearity. Columns from index 2 onward are the numeric features
# (columns 0-1 are assumed to be the location/date identifiers -- TODO confirm).
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per predictor fed into the
# country-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original
# predictors. The previous version labelled them with the raw feature names
# ('hospital_beds_per_thousand', ...), which made the downstream
# feature-importance table appear to rank the original variables when it
# actually ranked abstract components. Label them honestly as PC1..PC6.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns themselves are never
# fed to the model below; kept for parity with the wider analysis).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9509416306303644
# Refit a Random Forest with the best configuration found by the grid search.
# Unpacking best_params_ replaces the previous copy-paste of each individual
# key, which could silently drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and it returns inf whenever a predicted value
# is 0 where the true value is non-zero. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003709674750284998 R2 Score: 0.9977373829294084 RMSE: 0.060907 Entropy Value: 0.0006342010482872406
# Rank the model inputs by impurity-based importance, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.598261 |
| 0 | hospital_beds_per_thousand | 0.292702 |
| 2 | extreme_poverty | 0.046316 |
| 3 | gdp_per_capita | 0.031321 |
| 1 | human_development_index | 0.026994 |
| 4 | population_density | 0.004406 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or a configuration constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Serbia'
# Keep only the population-health predictors needed for this pair's
# Random Forest analysis.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2109 rows × 9 columns
'''
Convert the time series into a supervised-learning table by adding lagged
mortality-rate columns (previous day, week, and month) via pandas shift().
A Random Forest is a non-sequential ensemble learner, so each row must be a
self-contained observation; the lag columns carry the temporal signal so the
model can rank predictors of COVID-19 mortality per country.
'''
# Lag the per-country mortality rate by 1, 7, and 30 days; rows with no
# history (the start of each country's series) get 0 instead of NaN.
for lag_days, lag_col in ((1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Decorrelate the numeric predictors with a full PCA fit to address
# multi-collinearity. Columns from index 2 onward are the numeric features
# (columns 0-1 are assumed to be the location/date identifiers -- TODO confirm).
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per predictor fed into the
# population-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original
# predictors. The previous version labelled them with the raw feature names
# ('cardiovasc_death_rate', ...), which made the downstream feature-importance
# table appear to rank the original variables when it actually ranked
# abstract components. Label them honestly as PC1..PC6.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns themselves are never
# fed to the model below; kept for parity with the wider analysis).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9898468756077163
# Refit a Random Forest with the best configuration found by the grid search.
# Unpacking best_params_ replaces the previous copy-paste of each individual
# key, which could silently drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and it returns inf whenever a predicted value
# is 0 where the true value is non-zero. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.12251771563518532 R2 Score: 0.9894511723406865 RMSE: 0.350025 Entropy Value: 0.0026765662501610065
# Rank the model inputs by impurity-based importance, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.675523 |
| 1 | diabetes_prevalence | 0.292794 |
| 2 | male_smokers | 0.020230 |
| 3 | life_expectancy | 0.006730 |
| 5 | median_age | 0.003406 |
| 4 | aged_65_older | 0.001317 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path
# or a configuration constant so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Serbia'
# Keep only the socioeconomic / health-system predictors needed for the
# country-health-index analysis of this pair.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.980 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2109 rows × 9 columns
'''
Convert the time series into a supervised-learning table by adding lagged
mortality-rate columns (previous day, week, and month) via pandas shift().
A Random Forest is a non-sequential ensemble learner, so each row must be a
self-contained observation; the lag columns carry the temporal signal so the
model can rank predictors of COVID-19 mortality per country.
'''
# Lag the per-country mortality rate by 1, 7, and 30 days; rows with no
# history (the start of each country's series) get 0 instead of NaN.
for lag_days, lag_col in ((1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Decorrelate the numeric predictors with a full PCA fit to address
# multi-collinearity. Columns from index 2 onward are the numeric features
# (columns 0-1 are assumed to be the location/date identifiers -- TODO confirm).
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per predictor fed into the
# country-health-index Random Forest analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original
# predictors. The previous version labelled them with the raw feature names
# ('hospital_beds_per_thousand', ...), which made the downstream
# feature-importance table appear to rank the original variables when it
# actually ranked abstract components. Label them honestly as PC1..PC6.
pc_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (the dummy columns themselves are never
# fed to the model below; kept for parity with the wider analysis).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_cols
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, so no test-set statistics leak
# into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale both splits with the statistics learned from the training data only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search scored with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9913712149726834
# Refit a Random Forest with the best configuration found by the grid search.
# Unpacking best_params_ replaces the previous copy-paste of each individual
# key, which could silently drift out of sync with param_grid.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate with MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence -- it is not a
# standard regression metric, and it returns inf whenever a predicted value
# is 0 where the true value is non-zero. Confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.14086060776709658 R2 Score: 0.9878718414915131 RMSE: 0.375314 Entropy Value: 0.002701292210197016
# Rank the model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components that were relabeled
# with the original column names upstream, so the importances describe PC1..PC6,
# not the raw variables they are named after.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.943928 |
| 2 | extreme_poverty | 0.024987 |
| 5 | population | 0.011892 |
| 3 | gdp_per_capita | 0.010286 |
| 0 | hospital_beds_per_thousand | 0.007449 |
| 4 | population_density | 0.001457 |
# Country Pair by Pair Analysis relative to life expectancy
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# prefer a relative path or a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on life expectancy (13 pairs of countries)
def _rows_for(country):
    """Return the daily rows of `df` belonging to a single country."""
    return df[df.location == country]

df_Austria = _rows_for("Austria")
df_Belgium = _rows_for("Belgium")
df_Canada = _rows_for("Canada")
df_Cyprus = _rows_for("Cyprus")
df_Denmark = _rows_for("Denmark")
df_Finland = _rows_for("Finland")
df_France = _rows_for("France")
df_Iceland = _rows_for("Iceland")
df_Ireland = _rows_for("Ireland")
df_Italy = _rows_for("Italy")
df_Luxembourg = _rows_for("Luxembourg")
df_Netherlands = _rows_for("Netherlands")
df_Portugal = _rows_for("Portugal")
df_Slovenia = _rows_for("Slovenia")
df_Spain = _rows_for("Spain")
df_Sweden = _rows_for("Sweden")
df_Switzerland = _rows_for("Switzerland")
df_UnitedKingdom = _rows_for("United Kingdom")
df_Czechia = _rows_for("Czechia")
df_Estonia = _rows_for("Estonia")
df_UnitedStates = _rows_for("United States")
df_Bulgaria = _rows_for("Bulgaria")
df_Latvia = _rows_for("Latvia")
df_Romania = _rows_for("Romania")
df_Serbia = _rows_for("Serbia")
df_Slovakia = _rows_for("Slovakia")
# Drop the first two UK rows — presumably to align its series start with the
# other countries; TODO confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
              df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal,
              df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia,
              df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy,
              df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file (written to the current working directory)
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the previous cell wrote "dataframe-one.csv" to the working
# directory, but this reads from the Downloads folder — the file is apparently
# moved by hand. Re-reading may also add an "Unnamed: 0" index column (to_csv
# wrote the index) — verify; `df_updated = dataframe_one` would avoid the
# round-trip entirely.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries examined in this section.
country1 = 'Austria'
country2 = 'Belgium'
# Keep identifiers, the population-health features, and the target, restricted
# to rows belonging to the selected country pair (same result as selecting
# columns first and filtering rows second).
_pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                    'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 18.571 | 41.8 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Autoregressive features: mortality shifted by 1 day, 7 days, and 30 days.
# Shifting within each location group keeps one country's history from
# bleeding into the next.
for _lag_col, _periods in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    # The first `_periods` rows of each group have no history; fill with 0.
    df_updated[_lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(_periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): this fits PCA on ALL rows (training and future test rows alike)
# and on every column from position 2 onward — which includes 'Mortality Rate'
# itself plus the three lagged-mortality columns just created. The components
# therefore encode the target, which largely explains the near-perfect R^2
# reported downstream. PCA should be fitted on training-set features only,
# with the target excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project all rows onto the first six components (fitted above on all columns,
# including the target and its lags — see leakage note at the PCA fit).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC6, NOT the original variables —
# reusing the raw feature names makes the later feature-importance table
# misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — only
# 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Partition the data 70/30 into training and testing portions (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Fit the standardizer on the training portion only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows are autocorrelated daily observations, so random 10-fold
# CV shares near-duplicate samples between folds; scores are optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9969932164978618
# Refit a forest configured with the hyperparameters the grid search selected
# (same four keys, same seed — identical to spelling each keyword out).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is a KL divergence between normalized
# distributions; on raw regression targets/predictions it is not a meaningful
# error metric — kept only to preserve the reported output.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01827006714784305 R2 Score: 0.9984568901791381 RMSE: 0.135167 Entropy Value: 0.0007004294195467055
# Rank the model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components relabeled with raw
# column names upstream — the importances describe PC1..PC6, not the original
# variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.630882 |
| 0 | cardiovasc_death_rate | 0.309663 |
| 2 | female_smokers | 0.039951 |
| 5 | median_age | 0.015845 |
| 3 | male_smokers | 0.003381 |
| 4 | aged_65_older | 0.000277 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute path to a manually relocated copy of the
# file written earlier to the working directory; re-reading may add an
# "Unnamed: 0" index column (to_csv wrote the index) — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries examined in this section.
country1 = 'Austria'
country2 = 'Belgium'
# Keep identifiers, the country-health features, and the target, restricted to
# rows belonging to the selected country pair (same result as selecting
# columns first and filtering rows second).
_country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                        'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                        'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Autoregressive features: mortality shifted by 1 day, 7 days, and 30 days,
# computed within each location group so histories never cross countries.
for _lag_col, _periods in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    # The first `_periods` rows of each group have no history; fill with 0.
    df_updated[_lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(_periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (train + test) and on every column
# from position 2 onward — including 'Mortality Rate' and its three lag
# columns. The components encode the target, inflating the downstream R^2.
# Fit on training-set features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project all rows onto the first six components (fitted above on all columns,
# including the target and its lags — see leakage note at the PCA fit).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC6, NOT the original variables —
# reusing the raw names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — only
# 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Partition the data 70/30 into training and testing portions (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Fit the standardizer on the training portion only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows are autocorrelated daily observations, so random 10-fold
# CV shares near-duplicate samples between folds; scores are optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9978932925070829
# Refit a forest configured with the hyperparameters the grid search selected
# (same four keys, same seed — identical to spelling each keyword out).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set metrics: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is a KL divergence between normalized
# distributions; on raw regression targets/predictions it is not a meaningful
# error metric — kept only to preserve the reported output.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008856177569188209 R2 Score: 0.9992519975722188 RMSE: 0.094107 Entropy Value: 0.000433455898033473
# Rank the model inputs by impurity-based importance.
# NOTE(review): these "features" are principal components relabeled with raw
# column names upstream — the importances describe PC1..PC6, not the original
# variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927909 |
| 2 | extreme_poverty | 0.038889 |
| 5 | population | 0.030571 |
| 3 | gdp_per_capita | 0.002263 |
| 4 | population_density | 0.000344 |
| 0 | hospital_beds_per_thousand | 0.000024 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute path to a manually relocated copy of the
# file written earlier to the working directory; re-reading may add an
# "Unnamed: 0" index column (to_csv wrote the index) — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries examined in this section.
country1 = 'Canada'
country2 = 'Cyprus'
# Keep identifiers, the population-health features, and the target, restricted
# to rows belonging to the selected country pair (same result as selecting
# columns first and filtering rows second).
_pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                    'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), _pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 16.984 | 41.4 | 1.093162 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Autoregressive features: mortality shifted by 1 day, 7 days, and 30 days,
# computed within each location group so histories never cross countries.
for _lag_col, _periods in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    # The first `_periods` rows of each group have no history; fill with 0.
    df_updated[_lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(_periods).fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows (train + test) and on every column
# from position 2 onward — including 'Mortality Rate' and its three lag
# columns. The components encode the target, inflating the downstream R^2.
# Fit on training-set features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project all rows onto the first six components (fitted above on all columns,
# including the target and its lags — see leakage note at the PCA fit).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC6, NOT the original variables —
# reusing the raw names makes the later feature-importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — only
# 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Partition the data 70/30 into training and testing portions (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Fit the standardizer on the training portion only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows are autocorrelated daily observations, so random 10-fold
# CV shares near-duplicate samples between folds; scores are optimistic.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9982932314972184
# Refit a random forest with the best hyperparameters found by the grid
# search. **best_params_ unpacks exactly the tuned keys, so this cannot
# drift out of sync with param_grid if the grid is edited later (the
# previous version copied each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (it renormalizes its inputs); applying it to raw
# mortality values — which include zeros — is not a standard regression metric.
# Confirm intent; consider dropping it or using MAE / explained variance instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032838118429310025 R2 Score: 0.9990345765129632 RMSE: 0.057305 Entropy Value: 0.00041277214242018204
# Rank the model inputs by the fitted forest's impurity-based importance
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.853954 |
| 1 | diabetes_prevalence | 0.102526 |
| 2 | female_smokers | 0.020535 |
| 5 | median_age | 0.020360 |
| 3 | male_smokers | 0.002201 |
| 4 | aged_65_older | 0.000425 |
# Reload the dataframe holding the first country of each pairing from the previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analyzed in this run
country1 = 'Canada'
country2 = 'Cyprus'
# Restrict the rows to the selected pair and keep only the columns used by
# the country-health-index random-forest model (single .loc does both).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.4 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2099 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest
# expects independent tabular rows rather than sequential data, so we add
# lagged copies of the target (previous day / week / month mortality) as
# features via pandas shift(). The shift is computed within each country
# group so a lag never crosses a country boundary, which lets the model
# assess which variables best predict COVID-19 mortality per country.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    # shift() leaves NaN at the head of each group; backfill those with 0
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the features, which likely explains the near-perfect R^2 downstream.
# Also, PCA is fit on unscaled data and on the full dataset before the
# train/test split; confirm intent and consider dropping the target columns,
# scaling first, and fitting PCA on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
[cell output] PCA()
# Keep the first 6 principal components — this matches the number of input
# variables used by the Random Forest country-health-index model.
n_components = 6
# NOTE(review): the transform input (iloc[:, 2:]) must match what pca.fit saw,
# which here still includes 'Mortality Rate' and its lag columns (target
# leakage) — confirm and exclude those columns in both fit and transform.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Label the components PC1..PC6. Each principal component is a linear
# combination of ALL input variables, so reusing the original feature names
# here (as before) falsely implied a 1:1 correspondence with those features.
pc_names = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Downstream code (model input, feature-importance labels) keys off this list
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 split of the dataset into training and testing sets for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids test-set leakage in scaling)
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize both splits using the statistics fit on the training set
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base random-forest regressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid, scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9989558784228938
# Refit a random forest with the best hyperparameters found by the grid
# search. **best_params_ unpacks exactly the tuned keys, so this cannot
# drift out of sync with param_grid if the grid is edited later (the
# previous version copied each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (it renormalizes its inputs); applying it to raw
# mortality values — which include zeros — is not a standard regression metric.
# Confirm intent; consider dropping it or using MAE / explained variance instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032162020605789846 R2 Score: 0.9990544534349546 RMSE: 0.056712 Entropy Value: 0.000341364428693161
# Rank the model inputs by the fitted forest's impurity-based importance
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.515688 |
| 1 | human_development_index | 0.452145 |
| 2 | extreme_poverty | 0.028238 |
| 3 | gdp_per_capita | 0.003655 |
| 4 | population_density | 0.000192 |
| 0 | hospital_beds_per_thousand | 0.000083 |
# Reload the dataframe holding the first country of each pairing from the previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analyzed in this run
country1 = 'Denmark'
country2 = 'Finland'
# Restrict the rows to the selected pair and keep only the columns used by
# the population-health-index random-forest model (single .loc does both).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest
# expects independent tabular rows rather than sequential data, so we add
# lagged copies of the target (previous day / week / month mortality) as
# features via pandas shift(). The shift is computed within each country
# group so a lag never crosses a country boundary, which lets the model
# assess which variables best predict COVID-19 mortality per country.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    # shift() leaves NaN at the head of each group; backfill those with 0
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the features, which likely explains the near-perfect R^2 downstream.
# Also, PCA is fit on unscaled data and on the full dataset before the
# train/test split; confirm intent and consider dropping the target columns,
# scaling first, and fitting PCA on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
[cell output] PCA()
# Keep the first 6 principal components — this matches the number of input
# variables used by the Random Forest population-health-index model.
n_components = 6
# NOTE(review): the transform input (iloc[:, 2:]) must match what pca.fit saw,
# which here still includes 'Mortality Rate' and its lag columns (target
# leakage) — confirm and exclude those columns in both fit and transform.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Label the components PC1..PC6. Each principal component is a linear
# combination of ALL input variables, so reusing the original feature names
# here (as before) falsely implied a 1:1 correspondence with those features.
pc_names = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Downstream code (model input, feature-importance labels) keys off this list
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 split of the dataset into training and testing sets for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids test-set leakage in scaling)
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize both splits using the statistics fit on the training set
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base random-forest regressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid, scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989059824652742
# Refit a random forest with the best hyperparameters found by the grid
# search. **best_params_ unpacks exactly the tuned keys, so this cannot
# drift out of sync with param_grid if the grid is edited later (the
# previous version copied each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (it renormalizes its inputs); applying it to raw
# mortality values — which include zeros — is not a standard regression metric.
# Confirm intent; consider dropping it or using MAE / explained variance instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008510747064107163 R2 Score: 0.9951720209422329 RMSE: 0.092254 Entropy Value: 0.0017388182720325979
# Rank the model inputs by the fitted forest's impurity-based importance
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.955920 |
| 2 | female_smokers | 0.028636 |
| 3 | male_smokers | 0.005947 |
| 5 | median_age | 0.005500 |
| 0 | cardiovasc_death_rate | 0.003652 |
| 4 | aged_65_older | 0.000344 |
# Reload the dataframe holding the first country of each pairing from the previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analyzed in this run
country1 = 'Denmark'
country2 = 'Finland'
# Restrict the rows to the selected pair and keep only the columns used by
# the country-health-index random-forest model (single .loc does both).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest
# expects independent tabular rows rather than sequential data, so we add
# lagged copies of the target (previous day / week / month mortality) as
# features via pandas shift(). The shift is computed within each country
# group so a lag never crosses a country boundary, which lets the model
# assess which variables best predict COVID-19 mortality per country.
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in lags.items():
    # shift() leaves NaN at the head of each group; backfill those with 0
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and its lagged copies — the prediction target
# leaks into the features, which likely explains the near-perfect R^2 downstream.
# Also, PCA is fit on unscaled data and on the full dataset before the
# train/test split; confirm intent and consider dropping the target columns,
# scaling first, and fitting PCA on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
[cell output] PCA()
# Keep the first 6 principal components — this matches the number of input
# variables used by the Random Forest country-health-index model.
n_components = 6
# NOTE(review): the transform input (iloc[:, 2:]) must match what pca.fit saw,
# which here still includes 'Mortality Rate' and its lag columns (target
# leakage) — confirm and exclude those columns in both fit and transform.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Label the components PC1..PC6. Each principal component is a linear
# combination of ALL input variables, so reusing the original feature names
# here (as before) falsely implied a 1:1 correspondence with those features.
pc_names = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=pc_names)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Downstream code (model input, feature-importance labels) keys off this list
selected_cols = pc_names
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 split of the dataset into training and testing sets for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids test-set leakage in scaling)
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize both splits using the statistics fit on the training set
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base random-forest regressor that the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning run
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid, scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
# Refit a random forest with the best hyperparameters found by the grid
# search. **best_params_ unpacks exactly the tuned keys, so this cannot
# drift out of sync with param_grid if the grid is edited later (the
# previous version copied each key by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (it renormalizes its inputs); applying it to raw
# mortality values — which include zeros — is not a standard regression metric.
# Confirm intent; consider dropping it or using MAE / explained variance instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008329520038950185 R2 Score: 0.995274827461516 RMSE: 0.091266 Entropy Value: 0.0016508456649447665
# Rank the model inputs by the fitted forest's impurity-based importance
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956574 |
| 2 | extreme_poverty | 0.028531 |
| 5 | population | 0.008398 |
| 3 | gdp_per_capita | 0.006081 |
| 4 | population_density | 0.000371 |
| 0 | hospital_beds_per_thousand | 0.000044 |
# Reload the dataframe holding the first country of each pairing from the previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Iceland'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries being compared; .copy() materialises the selection
# so the lagged-mortality columns assigned later write into an independent frame
# instead of a view of the original (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates.  shift() converts the Our World in Data COVID-19 timeseries into a
# supervised-learning table (one row = one observation, one column = one
# feature): a Random Forest is an ensemble learner for non-sequential data and
# cannot consume the raw timeseries directly, so the past values it should
# condition on must be made explicit columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
# The first `lag` rows of each country have no history; treat missing lags as 0
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the full, unscaled dataset before the
# train/test split — test rows leak into the components and large-scale columns
# (e.g. population-sized values) dominate; consider fitting PCA on scaled
# training data only.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project every row onto the first 6 principal components of the PCA fitted above
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL the columns PCA was fitted on (iloc[:,2:] includes 'Mortality Rate' itself
# and the three lagged-mortality columns, i.e. the target leaks into the
# inputs), NOT the original variables.  Reusing the raw feature names here is
# misleading: the later feature-importance table ranks PCs, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# Model inputs are the 6 principal components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits — the test set must
# never be used to fit the scaler, otherwise information leaks into training
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores — results are identical, only wall-clock time improves
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9921689117203274
# Refit a Random Forest on the full training set with the best hyperparameters
# found above; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) directly into the constructor.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric; confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.08132098431951777 R2 Score: 0.9935334250489188 RMSE: 0.285168 Entropy Value: 0.00169048811690724
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.688876 |
| 0 | cardiovasc_death_rate | 0.269106 |
| 5 | median_age | 0.026844 |
| 2 | female_smokers | 0.009094 |
| 3 | male_smokers | 0.005258 |
| 4 | aged_65_older | 0.000822 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a
# configurable location so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Iceland'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared; .copy() materialises the selection
# so the lagged-mortality columns assigned later write into an independent frame
# instead of a view of the original (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates.  shift() converts the Our World in Data COVID-19 timeseries into a
# supervised-learning table (one row = one observation, one column = one
# feature): a Random Forest is an ensemble learner for non-sequential data and
# cannot consume the raw timeseries directly, so the past values it should
# condition on must be made explicit columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
# The first `lag` rows of each country have no history; treat missing lags as 0
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the full, unscaled dataset before the
# train/test split — test rows leak into the components and large-scale columns
# (e.g. population-sized values) dominate; consider fitting PCA on scaled
# training data only.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project every row onto the first 6 principal components of the PCA fitted above
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL the columns PCA was fitted on (iloc[:,2:] includes 'Mortality Rate' itself
# and the three lagged-mortality columns, i.e. the target leaks into the
# inputs), NOT the original variables.  Reusing the raw feature names here is
# misleading: the later feature-importance table ranks PCs, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs are the 6 principal components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits — the test set must
# never be used to fit the scaler, otherwise information leaks into training
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores — results are identical, only wall-clock time improves
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
# Refit a Random Forest on the full training set with the best hyperparameters
# found above; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) directly into the constructor.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric; confirm it is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.1263773231369439 R2 Score: 0.9899505836160186 RMSE: 0.355496 Entropy Value: 0.0032779127594959943
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.958053 |
| 2 | extreme_poverty | 0.017445 |
| 5 | population | 0.014344 |
| 3 | gdp_per_capita | 0.005157 |
| 0 | hospital_beds_per_thousand | 0.004252 |
| 4 | population_density | 0.000749 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a
# configurable location so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries being compared; .copy() materialises the selection
# so the lagged-mortality columns assigned later write into an independent frame
# instead of a view of the original (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates.  shift() converts the Our World in Data COVID-19 timeseries into a
# supervised-learning table (one row = one observation, one column = one
# feature): a Random Forest is an ensemble learner for non-sequential data and
# cannot consume the raw timeseries directly, so the past values it should
# condition on must be made explicit columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
# The first `lag` rows of each country have no history; treat missing lags as 0
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the full, unscaled dataset before the
# train/test split — test rows leak into the components and large-scale columns
# (e.g. population-sized values) dominate; consider fitting PCA on scaled
# training data only.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project every row onto the first 6 principal components of the PCA fitted above
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL the columns PCA was fitted on (iloc[:,2:] includes 'Mortality Rate' itself
# and the three lagged-mortality columns, i.e. the target leaks into the
# inputs), NOT the original variables.  Reusing the raw feature names here is
# misleading: the later feature-importance table ranks PCs, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# Model inputs are the 6 principal components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits — the test set must
# never be used to fit the scaler, otherwise information leaks into training
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10); n_jobs=-1 runs the
# folds on all CPU cores — results are identical, only wall-clock time improves
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998640466858235
# Refit a Random Forest on the full training set with the best hyperparameters
# found above; ** unpacks best_params_ (n_estimators, max_depth,
# min_samples_split, min_samples_leaf) directly into the constructor.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric; confirm it is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.012181853469582376 R2 Score: 0.9989992331498202 RMSE: 0.110371 Entropy Value: 0.00035999671659794484
# Rank the model inputs by their Random Forest importance scores, largest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.823708 |
| 1 | diabetes_prevalence | 0.127188 |
| 0 | cardiovasc_death_rate | 0.028491 |
| 2 | female_smokers | 0.017857 |
| 3 | male_smokers | 0.002469 |
| 4 | aged_65_older | 0.000286 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or a
# configurable location so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Italy'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared; .copy() materialises the selection
# so the lagged-mortality columns assigned later write into an independent frame
# instead of a view of the original (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates.  shift() converts the Our World in Data COVID-19 timeseries into a
# supervised-learning table (one row = one observation, one column = one
# feature): a Random Forest is an ensemble learner for non-sequential data and
# cannot consume the raw timeseries directly, so the past values it should
# condition on must be made explicit columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col, lag in zip(lag_cols, (1, 7, 30)):
    df_updated[col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
# The first `lag` rows of each country have no history; treat missing lags as 0
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on the full, unscaled dataset before the
# train/test split — test rows leak into the components and large-scale columns
# (e.g. population-sized values) dominate; consider fitting PCA on scaled
# training data only.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the PCA above was fitted on the full dataset before the train/test split below,
# so the split cannot undo that fit-time leakage.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each retained column is a principal component — a linear combination of *all*
# PCA inputs — so labelling them with the original feature names is misleading; downstream
# "feature importances" rank components, not the raw features. Names like PC1..PC6 would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below — X is built from
# principal_df and y only reads 'Mortality Rate' — so this one-hot step is dead weight here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; on a time series this mixes future and
# past observations across the split — a chronological split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the RandomForestRegressor hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly avoids re-typing each key and silently drifting out of
# sync with param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two *probability
# distributions*; applying it to raw regression targets/predictions is statistically
# questionable (and returns inf if y_pred contains zeros where y_test does not) — consider
# replacing it with mean_absolute_error.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013803653998387413 R2 Score: 0.9988659985635657 RMSE: 0.117489 Entropy Value: 0.0004399732479586434
# Rank the model inputs by their impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937618 |
| 2 | extreme_poverty | 0.027005 |
| 5 | population | 0.023078 |
| 0 | hospital_beds_per_thousand | 0.008608 |
| 3 | gdp_per_capita | 0.002772 |
| 4 | population_density | 0.000918 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — this breaks on any other machine;
# consider a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the population-health-index features used by the Random Forest analysis,
# then restrict the rows to the current country pairing.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                   'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 14.312 | 39.7 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' before shift() keeps each country's lag inside its own series
# (no cross-country bleed at the boundary between the two countries' rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the lag warm-up rows equates "no history yet" with "zero mortality".
# The earliest rows shown are themselves 0, so this is harmless here, but it would bias any
# series that starts mid-epidemic — confirm before reusing.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still contains
# 'Mortality Rate' (the prediction target) plus its three lagged copies — the target leaks into
# the components. PCA is also fitted on unscaled columns, so large-magnitude columns dominate
# the explained variance. Consider dropping the target columns and standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): the PCA above was fitted on the full dataset before the train/test split below,
# so the split cannot undo that fit-time leakage.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each retained column is a principal component — a linear combination of *all*
# PCA inputs — so labelling them with the original feature names is misleading; downstream
# "feature importances" rank components, not the raw features. Names like PC1..PC6 would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below — X is built from
# principal_df and y only reads 'Mortality Rate' — so this one-hot step is dead weight here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; on a time series this mixes future and
# past observations across the split — a chronological split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the RandomForestRegressor hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985636639377201
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly avoids re-typing each key and silently drifting out of
# sync with param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two *probability
# distributions*; applying it to raw regression targets/predictions is statistically
# questionable (and returns inf if y_pred contains zeros where y_test does not) — consider
# replacing it with mean_absolute_error.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01177642506262874 R2 Score: 0.9984651456288152 RMSE: 0.108519 Entropy Value: 0.0011725037332187857
# Rank the model inputs by their impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.792799 |
| 0 | cardiovasc_death_rate | 0.162571 |
| 2 | female_smokers | 0.023739 |
| 5 | median_age | 0.015139 |
| 3 | male_smokers | 0.004875 |
| 4 | aged_65_older | 0.000876 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — this breaks on any other machine;
# consider a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the country-health-index features used by the Random Forest analysis,
# then restrict the rows to the current country pairing.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                     'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.377872 |
2078 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' before shift() keeps each country's lag inside its own series
# (no cross-country bleed at the boundary between the two countries' rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the lag warm-up rows equates "no history yet" with "zero mortality".
# The earliest rows shown are themselves 0, so this is harmless here, but it would bias any
# series that starts mid-epidemic — confirm before reusing.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still contains
# 'Mortality Rate' (the prediction target) plus its three lagged copies — the target leaks into
# the components. PCA is also fitted on unscaled columns, so the largest-magnitude column
# (population) will dominate the explained variance. Consider dropping the target columns and
# standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): the PCA above was fitted on the full dataset before the train/test split below,
# so the split cannot undo that fit-time leakage.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each retained column is a principal component — a linear combination of *all*
# PCA inputs — so labelling them with the original feature names is misleading; downstream
# "feature importances" rank components, not the raw features. Names like PC1..PC6 would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below — X is built from
# principal_df and y only reads 'Mortality Rate' — so this one-hot step is dead weight here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; on a time series this mixes future and
# past observations across the split — a chronological split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Base estimator for the hyperparameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the RandomForestRegressor hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985273943931006
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly avoids re-typing each key and silently drifting out of
# sync with param_grid if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model on the held-out test set:
# Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two *probability
# distributions*; applying it to raw regression targets/predictions is statistically
# questionable (and returns inf if y_pred contains zeros where y_test does not) — consider
# replacing it with mean_absolute_error.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007512088981123421 R2 Score: 0.9990209284610492 RMSE: 0.086672 Entropy Value: 0.0009194530556352238
# Rank the model inputs by their impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.943109 |
| 2 | extreme_poverty | 0.037435 |
| 5 | population | 0.012103 |
| 3 | gdp_per_capita | 0.006234 |
| 0 | hospital_beds_per_thousand | 0.000694 |
| 4 | population_density | 0.000425 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — this breaks on any other machine;
# consider a relative path or a configurable constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep only the population-health-index features used by the Random Forest analysis,
# then restrict the rows to the current country pairing.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                   'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 19.062 | 44.5 | 0.536669 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' before shift() keeps each country's lag inside its own series
# (no cross-country bleed at the boundary between the two countries' rows).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the lag warm-up rows equates "no history yet" with "zero mortality".
# The earliest rows shown are themselves 0, so this is harmless here, but it would bias any
# series that starts mid-epidemic — confirm before reusing.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still contains
# 'Mortality Rate' (the prediction target) plus its three lagged copies — the target leaks into
# the components. PCA is also fitted on unscaled columns, so large-magnitude columns dominate
# the explained variance. Consider dropping the target columns and standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): the PCA above was fitted on the full dataset before the train/test split below,
# so the split cannot undo that fit-time leakage.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each retained column is a principal component — a linear combination of *all*
# PCA inputs — so labelling them with the original feature names is misleading; downstream
# "feature importances" rank components, not the raw features. Names like PC1..PC6 would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model input below — X is built from
# principal_df and y only reads 'Mortality Rate' — so this one-hot step is dead weight here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; on a time series this mixes future and
# past observations across the split — a chronological split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the exhaustive grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Grid search with 10-fold cross-validation (fit returns the fitted search).
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983664740108746
# Rebuild a forest with the grid search's best hyper-parameters (the dict
# holds exactly the four tuned keys) and refit on the scaled training split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set performance: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it computes a KL divergence), not as error vectors —
# confirm this metric is meaningful here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00327553495146727 R2 Score: 0.9983603374909553 RMSE: 0.057232 Entropy Value: 0.0004816810301686089
# Impurity-based importances of the six inputs, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.930766 |
| 0 | cardiovasc_death_rate | 0.038511 |
| 2 | female_smokers | 0.027558 |
| 5 | median_age | 0.001745 |
| 3 | male_smokers | 0.001122 |
| 4 | aged_65_older | 0.000299 |
# Reload the dataframe holding the first country of each pairing
# (produced in the previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this country-health-index run.
country1 = 'Portugal'
country2 = 'Slovenia'
# Keep the country/date keys, the health-system and socioeconomic features,
# and the mortality target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises the filtered selection as an independent frame so the
# lag-column assignments made later do not raise SettingWithCopyWarning
# (assignment into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
# A Random Forest expects independent tabular rows, so the temporal signal is
# encoded as per-country lagged mortality features (previous day, week and
# month) built with pandas shift(); this converts the time series into a
# supervised learning problem the forest can consume directly.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for col_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    # Rows at the start of a country's series have no history: fill with 0.
    df_updated[col_name] = mortality_by_country.shift(lag).fillna(0)
# Decorrelate the multi-collinear predictors with a full PCA fit over every
# column after location/date.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag
# columns, so the target itself feeds the PCA inputs — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components — one per model input of the
# country-health Random Forest analysis.
n_components = 6
projected = pca.transform(df_updated.iloc[:, 2:])
principal_components = projected[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# features; the feature names are retained only for downstream bookkeeping.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'],
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split.
# NOTE(review): a random split of time-series rows leaks future observations
# into training — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, to avoid leaking test statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the exhaustive grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Grid search with 10-fold cross-validation (fit returns the fitted search).
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
# Rebuild a forest with the grid search's best hyper-parameters (the dict
# holds exactly the four tuned keys) and refit on the scaled training split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set performance: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it computes a KL divergence), not as error vectors —
# confirm this metric is meaningful here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038490153568325174 R2 Score: 0.9980732655059875 RMSE: 0.062040 Entropy Value: 0.000510878761187502
# Impurity-based importances of the six inputs, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.964288 |
| 2 | extreme_poverty | 0.029185 |
| 5 | population | 0.004776 |
| 3 | gdp_per_capita | 0.001256 |
| 4 | population_density | 0.000316 |
| 0 | hospital_beds_per_thousand | 0.000179 |
# Reload the dataframe holding the first country of each pairing
# (produced in the previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this population-health-index run.
country1 = 'Spain'
country2 = 'Sweden'
# Keep the country/date keys, the population-health features, and the
# mortality target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materialises the filtered selection as an independent frame so the
# lag-column assignments made later do not raise SettingWithCopyWarning
# (assignment into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 19.436 | 45.5 | 0.855148 |
2126 rows × 9 columns
# A Random Forest expects independent tabular rows, so the temporal signal is
# encoded as per-country lagged mortality features (previous day, week and
# month) built with pandas shift(); this converts the time series into a
# supervised learning problem the forest can consume directly.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for col_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    # Rows at the start of a country's series have no history: fill with 0.
    df_updated[col_name] = mortality_by_country.shift(lag).fillna(0)
# Decorrelate the multi-collinear predictors with a full PCA fit over every
# column after location/date.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag
# columns, so the target itself feeds the PCA inputs — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components — one per model input of the
# population-health Random Forest analysis.
n_components = 6
projected = pca.transform(df_updated.iloc[:, 2:])
principal_components = projected[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# features; the feature names are retained only for downstream bookkeeping.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'aged_65_older', 'median_age'],
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split.
# NOTE(review): a random split of time-series rows leaks future observations
# into training — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, to avoid leaking test statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the exhaustive grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Grid search with 10-fold cross-validation (fit returns the fitted search).
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984490067368526
# Rebuild a forest with the grid search's best hyper-parameters (the dict
# holds exactly the four tuned keys) and refit on the scaled training split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set performance: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it computes a KL divergence), not as error vectors —
# confirm this metric is meaningful here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014451320867759824 R2 Score: 0.9983049038248775 RMSE: 0.120214 Entropy Value: 0.0005419190094358684
# Impurity-based importances of the six inputs, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.971991 |
| 2 | female_smokers | 0.021755 |
| 5 | median_age | 0.002900 |
| 3 | male_smokers | 0.002066 |
| 0 | cardiovasc_death_rate | 0.001038 |
| 4 | aged_65_older | 0.000250 |
# Reload the dataframe holding the first country of each pairing
# (produced in the previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this country-health-index run.
country1 = 'Spain'
country2 = 'Sweden'
# Keep the country/date keys, the health-system and socioeconomic features,
# and the mortality target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises the filtered selection as an independent frame so the
# lag-column assignments made later do not raise SettingWithCopyWarning
# (assignment into a view of the original frame).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2126 rows × 9 columns
# A Random Forest expects independent tabular rows, so the temporal signal is
# encoded as per-country lagged mortality features (previous day, week and
# month) built with pandas shift(); this converts the time series into a
# supervised learning problem the forest can consume directly.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for col_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    # Rows at the start of a country's series have no history: fill with 0.
    df_updated[col_name] = mortality_by_country.shift(lag).fillna(0)
# Decorrelate the multi-collinear predictors with a full PCA fit over every
# column after location/date.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag
# columns, so the target itself feeds the PCA inputs — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components — one per model input of the
# country-health Random Forest analysis.
n_components = 6
projected = pca.transform(df_updated.iloc[:, 2:])
principal_components = projected[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# features; the feature names are retained only for downstream bookkeeping.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density',
             'population'],
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label so the frame is fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split.
# NOTE(review): a random split of time-series rows leaks future observations
# into training — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only, to avoid leaking test statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the exhaustive grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Grid search with 10-fold cross-validation (fit returns the fitted search).
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985661804087467
# Rebuild a forest with the grid search's best hyper-parameters (the dict
# holds exactly the four tuned keys) and refit on the scaled training split.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set performance: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it computes a KL divergence), not as error vectors —
# confirm this metric is meaningful here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01796724914647103 R2 Score: 0.9978924960850047 RMSE: 0.134042 Entropy Value: 0.0006245047380715956
# Impurity-based importances of the six inputs, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.972786 |
| 2 | extreme_poverty | 0.022063 |
| 5 | population | 0.002947 |
| 3 | gdp_per_capita | 0.001796 |
| 4 | population_density | 0.000333 |
| 0 | hospital_beds_per_thousand | 0.000074 |
# Reload the dataframe holding the first country of each pairing
# (produced in the previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this section.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes an independent frame so the lagged-column assignments in the
# next cell do not raise pandas' SettingWithCopyWarning on a view of the original.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 18.436 | 43.1 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never pulls a
# value across the boundary between the two countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the 0-fill encodes "no history yet" as "zero mortality" for each
# country's first day/week/month of rows; dropping those warm-up rows instead may be
# cleaner — confirm the fill value is intentional.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the prediction target) and its lagged copies — the
# components therefore carry target information into the features (leakage) and will
# inflate downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns dominate the
# components; standardizing before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component — a linear mix of ALL PCA
# inputs (including the lagged mortality columns) — not the original variable it is
# named after; the later feature-importance table therefore ranks components, not the
# raw features whose names it displays.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never included in X below, so this
# encoding does not affect the model; it only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both feature matrices with the scaler fitted on the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.96135172234504
# Refit a Random Forest on the training set using the tuned hyperparameters;
# unpacking best_params_ passes n_estimators, max_depth, min_samples_split and
# min_samples_leaf exactly as the grid search selected them.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the tuned model on the held-out test set: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the
# two vectors after normalizing each to sum to 1 — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.28543037523612425 R2 Score: 0.988729253100176 RMSE: 0.534257 Entropy Value: 0.0037396223964769847
# Rank the model inputs by the importance the fitted forest assigned to them,
# highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.823701 |
| 5 | median_age | 0.064555 |
| 1 | diabetes_prevalence | 0.055837 |
| 2 | female_smokers | 0.026659 |
| 3 | male_smokers | 0.019169 |
| 4 | aged_65_older | 0.010079 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — a relative path or configurable
# data directory would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this section.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes an independent frame so the lagged-column assignments in the
# next cell do not raise pandas' SettingWithCopyWarning on a view of the original.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never pulls a
# value across the boundary between the two countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the 0-fill encodes "no history yet" as "zero mortality" for each
# country's first day/week/month of rows; dropping those warm-up rows instead may be
# cleaner — confirm the fill value is intentional.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the prediction target) and its lagged copies — the
# components therefore carry target information into the features (leakage) and will
# inflate downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns (e.g.
# population) dominate the components; standardizing before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component — a linear mix of ALL PCA
# inputs (including the lagged mortality columns) — not the original variable it is
# named after; the later feature-importance table therefore ranks components, not the
# raw features whose names it displays.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never included in X below, so this
# encoding does not affect the model; it only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both feature matrices with the scaler fitted on the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.958329239005943
# Refit a Random Forest on the training set using the tuned hyperparameters;
# unpacking best_params_ passes n_estimators, max_depth, min_samples_split and
# min_samples_leaf exactly as the grid search selected them.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the tuned model on the held-out test set: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the
# two vectors after normalizing each to sum to 1 — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.5800890923694779 R2 Score: 0.9770941080323481 RMSE: 0.761636 Entropy Value: 0.004716031800673148
# Rank the model inputs by the importance the fitted forest assigned to them,
# highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.832172 |
| 5 | population | 0.070924 |
| 2 | extreme_poverty | 0.041400 |
| 3 | gdp_per_capita | 0.035553 |
| 4 | population_density | 0.019898 |
| 0 | hospital_beds_per_thousand | 0.000053 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — a relative path or configurable
# data directory would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this section.
country1 = 'Czechia'
country2 = 'Estonia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes an independent frame so the lagged-column assignments in the
# next cell do not raise pandas' SettingWithCopyWarning on a view of the original.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 19.452 | 42.7 | 0.466423 |
2095 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate, so a lag never pulls a
# value across the boundary between the two countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the 0-fill encodes "no history yet" as "zero mortality" for each
# country's first day/week/month of rows; dropping those warm-up rows instead may be
# cleaner — confirm the fill value is intentional.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# includes 'Mortality Rate' (the prediction target) and its lagged copies — the
# components therefore carry target information into the features (leakage) and will
# inflate downstream R^2; confirm this is intended.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns dominate the
# components; standardizing before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component — a linear mix of ALL PCA
# inputs (including the lagged mortality columns) — not the original variable it is
# named after; the later feature-importance table therefore ranks components, not the
# raw features whose names it displays.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never included in X below, so this
# encoding does not affect the model; it only changes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both feature matrices with the scaler fitted on the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over param_grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998089086726092
# Refit a Random Forest on the training set using the tuned hyperparameters;
# unpacking best_params_ passes n_estimators, max_depth, min_samples_split and
# min_samples_leaf exactly as the grid search selected them.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Score the tuned model on the held-out test set: MSE, RMSE, R^2 and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence between the
# two vectors after normalizing each to sum to 1 — not a standard regression metric.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0007961105439895268 R2 Score: 0.9988440348594501 RMSE: 0.028215 Entropy Value: 0.0002548819536094143
# Rank the model inputs by the importance the fitted forest assigned to them,
# highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.891714 |
| 2 | female_smokers | 0.048162 |
| 0 | cardiovasc_death_rate | 0.047789 |
| 3 | male_smokers | 0.008997 |
| 5 | median_age | 0.002963 |
| 4 | aged_65_older | 0.000374 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — a relative path or configurable
# data directory would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analyzed in this section.
country1 = 'Czechia'
country2 = 'Estonia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes an independent frame so the lagged-column assignments in the
# next cell do not raise pandas' SettingWithCopyWarning on a view of the original.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2095 rows × 9 columns
'''
Create lagged variables (previous day / week / month mortality) with the
pandas shift() method to convert the OWID COVID-19 timeseries into a
supervised-learning table: each row becomes one observation, which is the
form a Random Forest -- a non-sequential ensemble method -- needs in order
to rank the predictors of COVID-19 mortality per country.
'''
# Lagged mortality per country; rows before each country's lag window exists
# are zero-filled.  The lag columns stay available in df_updated.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# model's input features.
# FIX: previously the PCA was fit on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target and its three lagged copies -- target leakage
# that inflates the downstream model's apparent accuracy.  The PCA is now
# fit on the six predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
n_components = 6  # one component per input variable
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): these columns hold principal components, not the original
# features; the original names are kept only so downstream selection code
# keeps working.  Interpret "feature importances" accordingly.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
# Model inputs are the principal-component columns; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then
# apply the identical transform to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model.
# random_state pins the bootstrap/feature sampling for reproducibility; this
# instance is only the template that GridSearchCV clones and re-fits.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: forest size, maximum tree depth, and the minimum
# samples required to split a node / sit in a leaf (regularization knobs)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) over all
# 3*3*3*3 = 81 parameter combinations on the scaled training data
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (with a regressor and no explicit scoring=, best_score_ is the mean CV R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9968447535278674
# Re-fit a Random Forest configured with the winning hyperparameters
# found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and scipy.stats.entropy
# of the actual values relative to the predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0019107375406635276 R2 Score: 0.9972255787761853 RMSE: 0.043712 Entropy Value: 0.0005664584928279815
# Rank the model inputs by Random Forest impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927005 |
| 2 | extreme_poverty | 0.053546 |
| 3 | gdp_per_capita | 0.010799 |
| 5 | population | 0.005969 |
| 0 | hospital_beds_per_thousand | 0.002133 |
| 4 | population_density | 0.000548 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- replace with a relative or
# configurable path before re-running elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported frame for a quick sanity check.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed for the population health index model.
country1 = 'United States'
country2 = 'Bulgaria'
# Restrict to the identifier columns plus the population-health predictors
# used by the Random Forest population-health-index analysis.
health_features = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'male_smokers',
                   'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_features]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 15.413 | 38.3 | 1.084791 |
2100 rows × 9 columns
'''
Create lagged variables (previous day / week / month mortality) with the
pandas shift() method to convert the OWID COVID-19 timeseries into a
supervised-learning table: each row becomes one observation, which is the
form a Random Forest -- a non-sequential ensemble method -- needs in order
to rank the predictors of COVID-19 mortality per country.
'''
# Lagged mortality per country; rows before each country's lag window exists
# are zero-filled.  The lag columns stay available in df_updated.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# model's input features.
# FIX: previously the PCA was fit on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target and its three lagged copies -- target leakage
# that inflates the downstream model's apparent accuracy.  The PCA is now
# fit on the six predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'aged_65_older',
                'median_age']
n_components = 6  # one component per input variable
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): these columns hold principal components, not the original
# features; the original names are kept only so downstream selection code
# keeps working.  Interpret "feature importances" accordingly.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                 'female_smokers', 'male_smokers', 'aged_65_older',
                 'median_age']
# Model inputs are the principal-component columns; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then
# apply the identical transform to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model.
# random_state pins the bootstrap/feature sampling for reproducibility; this
# instance is only the template that GridSearchCV clones and re-fits.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: forest size, maximum tree depth, and the minimum
# samples required to split a node / sit in a leaf (regularization knobs)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) over all
# 3*3*3*3 = 81 parameter combinations on the scaled training data
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (with a regressor and no explicit scoring=, best_score_ is the mean CV R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.949358596400903
# Re-fit a Random Forest configured with the winning hyperparameters
# found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and scipy.stats.entropy
# of the actual values relative to the predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04053191186316277 R2 Score: 0.9796126270982285 RMSE: 0.201325 Entropy Value: 0.0013052064555693329
# Rank the model inputs by Random Forest impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.746023 |
| 1 | diabetes_prevalence | 0.151873 |
| 2 | female_smokers | 0.040737 |
| 3 | male_smokers | 0.022668 |
| 5 | median_age | 0.020063 |
| 4 | aged_65_older | 0.018636 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- replace with a relative or
# configurable path before re-running elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported frame for a quick sanity check.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed for the country health index model.
country1 = 'United States'
country2 = 'Bulgaria'
# Restrict to the identifier columns plus the socioeconomic / health-system
# predictors used by the Random Forest country-health-index analysis.
index_features = ['location', 'date', 'hospital_beds_per_thousand',
                  'human_development_index', 'extreme_poverty',
                  'gdp_per_capita', 'population_density', 'population',
                  'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_features]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.770 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2100 rows × 9 columns
'''
Create lagged variables (previous day / week / month mortality) with the
pandas shift() method to convert the OWID COVID-19 timeseries into a
supervised-learning table: each row becomes one observation, which is the
form a Random Forest -- a non-sequential ensemble method -- needs in order
to rank the predictors of COVID-19 mortality per country.
'''
# Lagged mortality per country; rows before each country's lag window exists
# are zero-filled.  The lag columns stay available in df_updated.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# model's input features.
# FIX: previously the PCA was fit on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target and its three lagged copies -- target leakage
# that inflates the downstream model's apparent accuracy.  The PCA is now
# fit on the six predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
n_components = 6  # one component per input variable
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): these columns hold principal components, not the original
# features; the original names are kept only so downstream selection code
# keeps working.  Interpret "feature importances" accordingly.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the remaining categorical column ('location') via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
# Model inputs are the principal-component columns; target is the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training split only, then
# apply the identical transform to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model.
# random_state pins the bootstrap/feature sampling for reproducibility; this
# instance is only the template that GridSearchCV clones and re-fits.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: forest size, maximum tree depth, and the minimum
# samples required to split a node / sit in a leaf (regularization knobs)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) over all
# 3*3*3*3 = 81 parameter combinations on the scaled training data
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (with a regressor and no explicit scoring=, best_score_ is the mean CV R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9463618954147719
# Re-fit a Random Forest configured with the winning hyperparameters
# found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and scipy.stats.entropy
# of the actual values relative to the predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021173310142172767 R2 Score: 0.9893499183830592 RMSE: 0.145511 Entropy Value: 0.0007566541098966247
# Rank the model inputs by Random Forest impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.651323 |
| 5 | population | 0.119828 |
| 0 | hospital_beds_per_thousand | 0.104203 |
| 2 | extreme_poverty | 0.062823 |
| 3 | gdp_per_capita | 0.031452 |
| 4 | population_density | 0.030370 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- replace with a relative or
# configurable path before re-running elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported frame for a quick sanity check.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed for the population health index model.
country1 = 'Latvia'
country2 = 'Romania'
# Restrict to the identifier columns plus the population-health predictors
# used by the Random Forest population-health-index analysis.
health_features = ['location', 'date', 'cardiovasc_death_rate',
                   'diabetes_prevalence', 'female_smokers', 'male_smokers',
                   'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_features]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 19.754 | 43.9 | 0.631969 |
2076 rows × 9 columns
'''
Create lagged variables (previous day / week / month mortality) with the
pandas shift() method to convert the OWID COVID-19 timeseries into a
supervised-learning table: each row becomes one observation, which is the
form a Random Forest -- a non-sequential ensemble method -- needs in order
to rank the predictors of COVID-19 mortality per country.
'''
# Lagged mortality per country; rows before each country's lag window exists
# are zero-filled.  The lag columns stay available in df_updated.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# model's input features.
# FIX: previously the PCA was fit on df_updated.iloc[:, 2:], which included
# the 'Mortality Rate' target and its three lagged copies -- target leakage
# that inflates the downstream model's apparent accuracy.  The PCA is now
# fit on the six predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'aged_65_older',
                'median_age']
n_components = 6  # one component per input variable
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): these columns hold principal components, not the original
# features; the original names are kept only so downstream selection code
# keeps working.  Interpret "feature importances" accordingly.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
# The test set is transformed with the same training statistics.
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (n_estimators here is overridden by the grid search).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation
# (default scoring for a regressor is R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997239476634585
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Score the refit model on the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy() computes relative entropy between the (internally
# normalized) target and prediction vectors treated as distributions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004065478119768843 R2 Score: 0.9972309144609833 RMSE: 0.063761 Entropy Value: 0.0002428106325496918
# Rank the model inputs by the importance the fitted forest assigned them,
# highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.852131 |
| 5 | median_age | 0.077616 |
| 1 | diabetes_prevalence | 0.048967 |
| 2 | female_smokers | 0.014654 |
| 3 | male_smokers | 0.006080 |
| 4 | aged_65_older | 0.000552 |
# Load the dataframe holding the first country of each country pairing produced
# in the previous step.
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Romania'
# Restrict to the socioeconomic / health-system columns used by the
# country-health-index Random Forest, for the selected country pair only.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / month), shifted within each
# country so values never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lagged value; backfill with 0.
# NOTE(review): 0 is also a real early-pandemic mortality value — this conflates
# "no history yet" with "zero mortality"; dropping those rows may be sounder.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA over every column after 'location' and 'date' to address multi-collinearity.
# NOTE(review): given the column selection above, iloc[:, 2:] includes
# 'Mortality Rate' (used as y below) and the three lag columns just created —
# the prediction target leaks into the PCA inputs, inflating downstream R^2.
# NOTE(review): PCA is fitted on unscaled data; StandardScaler is only applied
# after PCA below — the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original features —
# naming them 'hospital_beds_per_thousand' etc. is misleading; the "feature
# importances" reported later are importances of PCs, not of the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this call only serves to remove the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split, fixed seed for reproducibility.
# NOTE(review): a random row split of a time series mixes future and past
# observations between train and test; a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (no test-set leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
# The test set is transformed with the same training statistics.
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (n_estimators here is overridden by the grid search).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation
# (default scoring for a regressor is R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9961638218621977
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Score the refit model on the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy() computes relative entropy between the (internally
# normalized) target and prediction vectors treated as distributions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007175888410943785 R2 Score: 0.9951123463851107 RMSE: 0.084711 Entropy Value: 0.0005117984164720532
# Rank the model inputs by the importance the fitted forest assigned them,
# highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.697452 |
| 5 | population | 0.195946 |
| 0 | hospital_beds_per_thousand | 0.073939 |
| 2 | extreme_poverty | 0.019689 |
| 3 | gdp_per_capita | 0.012548 |
| 4 | population_density | 0.000427 |
# Load the dataframe holding the first country of each country pairing produced
# in the previous step.
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the population-health columns used by the population-health-index
# Random Forest, for the selected country pair only.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 17.366 | 41.2 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / month), shifted within each
# country so values never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lagged value; backfill with 0.
# NOTE(review): 0 is also a real early-pandemic mortality value — this conflates
# "no history yet" with "zero mortality"; dropping those rows may be sounder.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA over every column after 'location' and 'date' to address multi-collinearity.
# NOTE(review): given the column selection above, iloc[:, 2:] includes
# 'Mortality Rate' (used as y below) and the three lag columns just created —
# the prediction target leaks into the PCA inputs, inflating downstream R^2.
# NOTE(review): PCA is fitted on unscaled data; StandardScaler is only applied
# after PCA below — the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original features —
# naming them 'cardiovasc_death_rate' etc. is misleading; the "feature
# importances" reported later are importances of PCs, not of the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this call only serves to remove the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split, fixed seed for reproducibility.
# NOTE(review): a random row split of a time series mixes future and past
# observations between train and test; a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (no test-set leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
# The test set is transformed with the same training statistics.
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (n_estimators here is overridden by the grid search).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation
# (default scoring for a regressor is R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9926858700401283
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Score the refit model on the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy() computes relative entropy between the (internally
# normalized) target and prediction vectors treated as distributions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0007922951938540062 R2 Score: 0.9967575480800499 RMSE: 0.028148 Entropy Value: 0.00046542512044971915
# Rank the model inputs by the importance the fitted forest assigned them,
# highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.871068 |
| 5 | median_age | 0.063050 |
| 0 | cardiovasc_death_rate | 0.030402 |
| 2 | female_smokers | 0.030107 |
| 3 | male_smokers | 0.003771 |
| 4 | aged_65_older | 0.001603 |
# Load the dataframe holding the first country of each country pairing produced
# in the previous step.
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the socioeconomic / health-system columns used by the
# country-health-index Random Forest, for the selected country pair only.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country time series into a supervised-learning table by adding
# lagged mortality features (previous day / week / month), shifted within each
# country so values never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no lagged value; backfill with 0.
# NOTE(review): 0 is also a real early-pandemic mortality value — this conflates
# "no history yet" with "zero mortality"; dropping those rows may be sounder.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA over every column after 'location' and 'date' to address multi-collinearity.
# NOTE(review): given the column selection above, iloc[:, 2:] includes
# 'Mortality Rate' (used as y below) and the three lag columns just created —
# the prediction target leaks into the PCA inputs, inflating downstream R^2.
# NOTE(review): PCA is fitted on unscaled data; StandardScaler is only applied
# after PCA below — the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per Random Forest input variable.
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these columns are principal components, NOT the original features —
# naming them 'hospital_beds_per_thousand' etc. is misleading; the "feature
# importances" reported later are importances of PCs, not of the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this call only serves to remove the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split, fixed seed for reproducibility.
# NOTE(review): a random row split of a time series mixes future and past
# observations between train and test; a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (no test-set leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training features with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
# The test set is transformed with the same training statistics.
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (n_estimators here is overridden by the grid search).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation
# (default scoring for a regressor is R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928368516844188
# Refit a Random Forest on the full training set using the hyperparameter
# combination selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Score the refit model on the held-out test rows.
y_pred = best_rf_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy's entropy() computes relative entropy between the (internally
# normalized) target and prediction vectors treated as distributions.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006865654195289836 R2 Score: 0.9971902450248445 RMSE: 0.026202 Entropy Value: 0.0004341579931648298
# Rank the model inputs by the importance the fitted forest assigned them,
# highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.877517 |
| 5 | population | 0.080276 |
| 2 | extreme_poverty | 0.031016 |
| 0 | hospital_beds_per_thousand | 0.005715 |
| 3 | gdp_per_capita | 0.003793 |
| 4 | population_density | 0.001682 |
# Country pair-by-pair analysis relative to aged_65_older.
# Reload the cleaned and preprocessed Our World in Data COVID-19 dataset.
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# relative path or a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on aged_65_older (13 pairs of countries)
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Canada = df.loc[df["location"] == "Canada"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Latvia = df.loc[df["location"] == "Latvia"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_UnitedStates = df.loc[df["location"] == "United States"]
# tail(-2) keeps every UK row except the first two.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Stack the first country from each defined pairing into a single dataframe.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia,
    df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg,
    df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
    df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)
# Export the combined dataframe as CSV (written to the current working directory).
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the previous cell wrote "dataframe-one.csv" to the working directory, but this
# reads from an absolute Downloads path — confirm both point at the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Finland'
# Keep the population-health features (plus identifiers and the target) and restrict
# the rows to the two paired countries in a single .loc selection.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
        'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 42.8 | 0.551590 |
2093 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the other's series.
# NOTE(review): shift() assumes rows are already in chronological order within each country — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates 0-mortality history for each country's first 1/7/30 days;
# dropping those warm-up rows instead would avoid biasing early samples — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so the target leaks
# into the components; PCA is also fit on the full (pre-split) and unscaled data — it should be
# fit on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these 6 columns are principal components (PC1..PC6), NOT the original variables —
# reusing the raw feature names makes the later feature-importance table look like it ranks the
# named features when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df);
# only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features; the scaler learns its statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# random_state=42 makes the forest deterministic across reruns.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 fits per search.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): scoring defaults to the regressor's R^2; pass scoring=... explicitly if another metric is intended.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9591779911867772
# Fit a Random Forest with the best hyperparameters found by the grid search above.
# NOTE(review): GridSearchCV (refit=True by default) already exposes the refit model as
# grid_search.best_estimator_; the explicit rebuild is kept only to pin random_state,
# and **best_params_ avoids copying each hyperparameter by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized) probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric; interpret with caution, especially since y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009184792684689342 R2 Score: 0.996290335239657 RMSE: 0.095837 Entropy Value: 0.001116882714947321
# Tabulate how much importance the fitted forest assigned to each input, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.836592 |
| 5 | median_age | 0.056852 |
| 1 | diabetes_prevalence | 0.052026 |
| 2 | female_smokers | 0.030780 |
| 3 | male_smokers | 0.018852 |
| 4 | life_expectancy | 0.004898 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): re-reading from disk resets df_updated after the previous analysis mutated it.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Finland'
# Keep the country-health features (plus identifiers and the target) and restrict
# the rows to the two paired countries in a single .loc selection.
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
        'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.50 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8372 | Finland | 12/26/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8373 | Finland | 12/27/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8374 | Finland | 12/28/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
| 8375 | Finland | 12/29/2022 | 3.280 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.551590 |
2093 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the other's series.
# NOTE(review): shift() assumes rows are already in chronological order within each country — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates 0-mortality history for each country's first 1/7/30 days;
# dropping those warm-up rows instead would avoid biasing early samples — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so the target leaks
# into the components; PCA is also fit on the full (pre-split) and unscaled data — it should be
# fit on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these 6 columns are principal components (PC1..PC6), NOT the original variables —
# reusing the raw feature names makes the later feature-importance table look like it ranks the
# named features when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df);
# only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features; the scaler learns its statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# random_state=42 makes the forest deterministic across reruns.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 fits per search.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): scoring defaults to the regressor's R^2; pass scoring=... explicitly if another metric is intended.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9598292232562027
# Fit a Random Forest with the best hyperparameters found by the grid search above.
# NOTE(review): GridSearchCV (refit=True by default) already exposes the refit model as
# grid_search.best_estimator_; the explicit rebuild is kept only to pin random_state,
# and **best_params_ avoids copying each hyperparameter by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized) probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric; interpret with caution, especially since y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007393669469321995 R2 Score: 0.9970137556696637 RMSE: 0.085986 Entropy Value: 0.0009964875018581088
# Tabulate how much importance the fitted forest assigned to each input, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.564647 |
| 1 | human_development_index | 0.222710 |
| 0 | hospital_beds_per_thousand | 0.148938 |
| 2 | extreme_poverty | 0.034093 |
| 3 | gdp_per_capita | 0.023922 |
| 4 | population_density | 0.005690 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): re-reading from disk resets df_updated after the previous analysis mutated it.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Italy'
country2 = 'Portugal'
# Keep the population-health features (plus identifiers and the target) and restrict
# the rows to the two paired countries in a single .loc selection.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
        'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 47.9 | 0.735109 |
2098 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps one country's lags from bleeding into the other's series.
# NOTE(review): shift() assumes rows are already in chronological order within each country — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates 0-mortality history for each country's first 1/7/30 days;
# dropping those warm-up rows instead would avoid biasing early samples — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so the target leaks
# into the components; PCA is also fit on the full (pre-split) and unscaled data — it should be
# fit on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these 6 columns are principal components (PC1..PC6), NOT the original variables —
# reusing the raw feature names makes the later feature-importance table look like it ranks the
# named features when it actually ranks components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used as model inputs (X comes from principal_df);
# only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features; the scaler learns its statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# random_state=42 makes the forest deterministic across reruns.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 fits per search.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): scoring defaults to the regressor's R^2; pass scoring=... explicitly if another metric is intended.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991914852672478
# Fit a Random Forest with the best hyperparameters found by the grid search above.
# NOTE(review): GridSearchCV (refit=True by default) already exposes the refit model as
# grid_search.best_estimator_; the explicit rebuild is kept only to pin random_state,
# and **best_params_ avoids copying each hyperparameter by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized) probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric; interpret with caution, especially since y_test contains zeros.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.04174502844368979 R2 Score: 0.996335689684427 RMSE: 0.204316 Entropy Value: 0.0018627830616119736
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.917329 |
| 1 | diabetes_prevalence | 0.052036 |
| 2 | female_smokers | 0.025708 |
| 5 | median_age | 0.002549 |
| 3 | male_smokers | 0.002091 |
| 4 | life_expectancy | 0.000287 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# original author's machine; prefer a relative path or a configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell echo).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country health index analysis.
country1 = 'Italy'
country2 = 'Portugal'
# Keep the socio-economic ("country health index") features plus the target,
# restricted to the two countries of interest (same result as selecting the
# columns first and then filtering the rows).
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
              'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2098 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per location so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged history and become 0)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' — the prediction target — plus the three lag
# columns. Components fitted on the target leak it into the downstream feature
# matrix and likely inflate the reported scores; confirm whether the target
# should be dropped from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
# NOTE(review): the transformed input still includes 'Mortality Rate' and the
# lag columns (pca was fit on iloc[:, 2:]) — target leakage; TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names makes the later feature-importance
# table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only
# (this avoids leaking test-set statistics into the scaling step).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (default R^2 scoring).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9991962488532368
# Fit a random forest on the full training split using the hyperparameters
# selected by the grid search. Unpacking best_params_ keeps this constructor in
# sync with param_grid automatically instead of copying every key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; applied to raw
# regression targets/predictions this is not a standard regression metric —
# confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): X holds principal components, so these importances describe
# components that merely reuse the original feature names — TODO confirm.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.923655 |
| 5 | population | 0.047005 |
| 2 | extreme_poverty | 0.026004 |
| 3 | gdp_per_capita | 0.003020 |
| 4 | population_density | 0.000296 |
| 0 | hospital_beds_per_thousand | 0.000020 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# original author's machine; prefer a relative path or a configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell echo).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population health index analysis.
country1 = 'Sweden'
country2 = 'Austria'
# Keep the population-health features plus the target, restricted to the two
# countries of interest (same result as selecting the columns first and then
# filtering the rows).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 41.0 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per location so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged history and become 0)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' — the prediction target — plus the three lag
# columns. Components fitted on the target leak it into the downstream feature
# matrix and likely inflate the reported scores; confirm whether the target
# should be dropped from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
# NOTE(review): the transformed input still includes 'Mortality Rate' and the
# lag columns (pca was fit on iloc[:, 2:]) — target leakage; TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names makes the later feature-importance
# table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only
# (this avoids leaking test-set statistics into the scaling step).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (default R^2 scoring).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9957633295820632
# Fit a random forest on the full training split using the hyperparameters
# selected by the grid search. Unpacking best_params_ keeps this constructor in
# sync with param_grid automatically instead of copying every key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; applied to raw
# regression targets/predictions this is not a standard regression metric —
# confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): X holds principal components, so these importances describe
# components that merely reuse the original feature names — TODO confirm.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.634096 |
| 1 | diabetes_prevalence | 0.222993 |
| 5 | median_age | 0.121456 |
| 2 | female_smokers | 0.016268 |
| 3 | male_smokers | 0.003877 |
| 4 | life_expectancy | 0.001309 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# original author's machine; prefer a relative path or a configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell echo).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country health index analysis.
country1 = 'Sweden'
country2 = 'Austria'
# Keep the socio-economic ("country health index") features plus the target,
# restricted to the two countries of interest (same result as selecting the
# columns first and then filtering the rows).
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
              'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per location so one country's history never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lagged history and become 0)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' — the prediction target — plus the three lag
# columns. Components fitted on the target leak it into the downstream feature
# matrix and likely inflate the reported scores; confirm whether the target
# should be dropped from the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the feature matrix onto the first 6 principal components.
# NOTE(review): the transformed input still includes 'Mortality Rate' and the
# lag columns (pca was fit on iloc[:, 2:]) — target leakage; TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names makes the later feature-importance
# table read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on time-series rows mixes future and
# past observations between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only
# (this avoids leaking test-set statistics into the scaling step).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (default R^2 scoring).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9959755632813897
# Fit a random forest on the full training split using the hyperparameters
# selected by the grid search. Unpacking best_params_ keeps this constructor in
# sync with param_grid automatically instead of copying every key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; applied to raw
# regression targets/predictions this is not a standard regression metric —
# confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): X holds principal components, so these importances describe
# components that merely reuse the original feature names — TODO confirm.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.948503 |
| 2 | extreme_poverty | 0.020497 |
| 5 | population | 0.019786 |
| 0 | hospital_beds_per_thousand | 0.006435 |
| 3 | gdp_per_capita | 0.003427 |
| 4 | population_density | 0.001354 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# original author's machine; prefer a relative path or a configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell echo).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Canada'
# Restrict the frame to the two countries under comparison and to the
# population-health predictors plus the target (Mortality Rate).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 41.4 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country: previous day (1), previous week (7),
# and previous month (30).  Rows with no history yet (the shift's NaNs) get 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the original code fit and applied PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself -- the target leaked into the
# principal components later used to predict it.  PCA is now restricted to the
# predictors only (the six health features plus the lagged mortality columns, which
# are legitimate inputs because they are known before the prediction date).
predictor_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_frame)
# Keep 6 principal components -- the number of input variables for the Random Forest
# Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(predictor_frame)[:, :n_components]
# NOTE(review): these columns hold principal-component SCORES, not the original
# features; the original feature names are retained only so downstream cells keep
# working.  Importances computed over them describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974429094620534
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Model evaluation: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors after normalizing each to sum to 1 -- confirm this is the
# intended "entropy" metric for regression residuals.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.034222729995244096 R2 Score: 0.9974121725749812 RMSE: 0.184994 Entropy Value: 0.0007442340019509717
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal-component scores that merely carry the
# original feature names, so these importances describe PCs rather than the raw
# features -- confirm before interpreting them as feature effects.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.900768 |
| 0 | cardiovasc_death_rate | 0.067474 |
| 2 | female_smokers | 0.024148 |
| 5 | median_age | 0.004890 |
| 3 | male_smokers | 0.002331 |
| 4 | life_expectancy | 0.000390 |
# Load the dataset holding the first country of each country pairing (built in the previous step)
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Canada'
# Restrict the frame to the two countries under comparison and to the
# country-health predictors plus the target (Mortality Rate).
health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country: previous day (1), previous week (7),
# and previous month (30).  Rows with no history yet (the shift's NaNs) get 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the original code fit and applied PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself -- the target leaked into the
# principal components later used to predict it.  PCA is now restricted to the
# predictors only (the six country-health features plus the lagged mortality columns,
# which are legitimate inputs because they are known before the prediction date).
predictor_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_frame)
# Keep 6 principal components -- the number of input variables for the Random Forest
# Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(predictor_frame)[:, :n_components]
# NOTE(review): these columns hold principal-component SCORES, not the original
# features; the original feature names are retained only so downstream cells keep
# working.  Importances computed over them describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975469171045417
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Model evaluation: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors after normalizing each to sum to 1 -- confirm this is the
# intended "entropy" metric for regression residuals.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03171734845754228 R2 Score: 0.9976016225415473 RMSE: 0.178094 Entropy Value: 0.0007572895754231443
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal-component scores that merely carry the
# original feature names, so these importances describe PCs rather than the raw
# features -- confirm before interpreting them as feature effects.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.965414 |
| 2 | extreme_poverty | 0.021985 |
| 5 | population | 0.007566 |
| 0 | hospital_beds_per_thousand | 0.002465 |
| 3 | gdp_per_capita | 0.001925 |
| 4 | population_density | 0.000644 |
# Load the dataset holding the first country of each country pairing (built in the previous step)
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Denmark'
# Restrict the frame to the two countries under comparison and to the
# population-health predictors plus the target (Mortality Rate).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 42.3 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country: previous day (1), previous week (7),
# and previous month (30).  Rows with no history yet (the shift's NaNs) get 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the predictors.
# BUG FIX: the original code fit and applied PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself -- the target leaked into the
# principal components later used to predict it.  PCA is now restricted to the
# predictors only (the six health features plus the lagged mortality columns, which
# are legitimate inputs because they are known before the prediction date).
predictor_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(predictor_frame)
# Keep 6 principal components -- the number of input variables for the Random Forest
# Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(predictor_frame)[:, :n_components]
# NOTE(review): these columns hold principal-component SCORES, not the original
# features; the original feature names are retained only so downstream cells keep
# working.  Importances computed over them describe PCs, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train-test split for the Random Forest model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only, then apply
# the same transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor to be tuned
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998749456645679
# Refit a RandomForestRegressor with the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Model evaluation: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# between the two vectors after normalizing each to sum to 1 -- confirm this is the
# intended "entropy" metric for regression residuals.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00293085842141847 R2 Score: 0.99756672605 RMSE: 0.054137 Entropy Value: 0.0006772694785891467
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal-component scores that merely carry the
# original feature names, so these importances describe PCs rather than the raw
# features -- confirm before interpreting them as feature effects.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.954483 |
| 2 | female_smokers | 0.023955 |
| 0 | cardiovasc_death_rate | 0.016581 |
| 5 | median_age | 0.003026 |
| 3 | male_smokers | 0.001699 |
| 4 | life_expectancy | 0.000256 |
# Load the dataset holding the first country of each country pairing (built in the previous step)
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Denmark'
# Restrict the frame to the two countries under comparison and to the
# country-health predictors plus the target (Mortality Rate).
health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features per country: previous day (1), previous week (7),
# and previous month (30).  Rows with no history yet (the shift's NaNs) get 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984672041343956
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Using **best_params_ avoids copying each hyperparameter by hand and is equivalent
# to the original per-key construction.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors normalized as probability distributions — it is not a standard
# regression metric and is ill-defined when y_pred has zeros where y_test does not.
# Kept for comparability with earlier runs; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0024692245202902934 R2 Score: 0.9979499863732701 RMSE: 0.049691 Entropy Value: 0.0005994641511856433
# Rank the PCA-derived inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.967631 |
| 2 | extreme_poverty | 0.025845 |
| 5 | population | 0.003256 |
| 3 | gdp_per_capita | 0.001799 |
| 0 | hospital_beds_per_thousand | 0.001150 |
| 4 | population_density | 0.000319 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries for this pairwise comparison.
country1 = 'Estonia'
country2 = 'France'
# Keep the population-health predictors plus identifiers and the target, and
# restrict the rows to the two selected countries in a single .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 42.0 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day, week, and month) per country so the
# time series is framed as a supervised learning problem for the Random Forest model.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() within each country; the first `lag` rows have no history, so fill with 0
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' — target leakage into the model inputs. Fit on the
# predictor columns only.
feature_df = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_df)
# NOTE(review): PCA is fitted on the full dataset before the train/test split; strictly
# it should be fitted on the training fold only to avoid information leakage.
n_components = 6  # matches the number of input variables for the Random Forest analysis
principal_components = pca.transform(feature_df)[:, :n_components]
# CAUTION: these labels reuse the original feature names, but each column is a principal
# component (a linear combination of all predictors), not the named feature itself.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not used as model
# inputs (X is built from principal_df below), they only remain in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply to both sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its n_estimators is overridden by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9950083298202264
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09514590931040419 R2 Score: 0.9900939715866824 RMSE: 0.308457 Entropy Value: 0.005407421439201657
# Rank the PCA-derived inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.789072 |
| 0 | cardiovasc_death_rate | 0.159897 |
| 5 | median_age | 0.031527 |
| 2 | female_smokers | 0.015592 |
| 3 | male_smokers | 0.003481 |
| 4 | life_expectancy | 0.000430 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries for this pairwise comparison.
country1 = 'Estonia'
country2 = 'France'
# Keep the country-health predictors plus identifiers and the target, and
# restrict the rows to the two selected countries in a single .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day, week, and month) per country so the
# time series is framed as a supervised learning problem for the Random Forest model.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() within each country; the first `lag` rows have no history, so fill with 0
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' — target leakage into the model inputs. Fit on the
# predictor columns only.
feature_df = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_df)
# NOTE(review): PCA is fitted on the full dataset before the train/test split; strictly
# it should be fitted on the training fold only to avoid information leakage.
n_components = 6  # matches the number of input variables for the Random Forest analysis
principal_components = pca.transform(feature_df)[:, :n_components]
# CAUTION: these labels reuse the original feature names, but each column is a principal
# component (a linear combination of all predictors), not the named feature itself.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not used as model
# inputs (X is built from principal_df below), they only remain in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply to both sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its n_estimators is overridden by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9924204242316919
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Using **best_params_ avoids copying each hyperparameter by hand and is equivalent
# to the original per-key construction.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between
# the two vectors normalized as probability distributions — it is not a standard
# regression metric and is ill-defined when y_pred has zeros where y_test does not.
# Kept for comparability with earlier runs; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09564363505394957 R2 Score: 0.9900421513308949 RMSE: 0.309263 Entropy Value: 0.005800126883421082
# Rank the PCA-derived inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.929469 |
| 5 | population | 0.033558 |
| 2 | extreme_poverty | 0.029972 |
| 3 | gdp_per_capita | 0.003907 |
| 0 | hospital_beds_per_thousand | 0.002285 |
| 4 | population_density | 0.000808 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook-style bare expression).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries for this pairwise comparison.
country1 = 'Latvia'
country2 = 'Netherlands'
# Keep the population-health predictors plus identifiers and the target, and
# restrict the rows to the two selected countries in a single .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 43.9 | 0.631969 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day, week, and month) per country so the
# time series is framed as a supervised learning problem for the Random Forest model.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # shift() within each country; the first `lag` rows have no history, so fill with 0
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among predictors.
# BUG FIX: the original code fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' — target leakage into the model inputs. Fit on the
# predictor columns only.
feature_df = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_df)
# NOTE(review): PCA is fitted on the full dataset before the train/test split; strictly
# it should be fitted on the training fold only to avoid information leakage.
n_components = 6  # matches the number of input variables for the Random Forest analysis
principal_components = pca.transform(feature_df)[:, :n_components]
# CAUTION: these labels reuse the original feature names, but each column is a principal
# component (a linear combination of all predictors), not the named feature itself.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'; note the resulting dummy columns are not used as model
# inputs (X is built from principal_df below), they only remain in df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize features: fit the scaler on the training set only, then apply to both sets.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its n_estimators is overridden by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985833133413833
# Reuse the refitted best estimator from the grid search: GridSearchCV with
# refit=True (the default) already retrains the model with the best
# hyper-parameters on the full training set, so rebuilding and refitting a
# RandomForestRegressor by hand duplicates work for an identical model
# (same params, same random_state=42, same training data).
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two normalized distributions, not a regression error metric; interpret this
# value with caution (it is undefined/inf when qk has zeros where pk does not).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X holds principal components, so these importances describe the
# PCA components, not the original variables whose names selected_cols carries.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.679384 |
| 0 | cardiovasc_death_rate | 0.274804 |
| 2 | female_smokers | 0.036255 |
| 5 | median_age | 0.005677 |
| 3 | male_smokers | 0.003251 |
| 4 | life_expectancy | 0.000629 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this analysis step.
country1 = 'Latvia'
country2 = 'Netherlands'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries under comparison. Take an explicit copy so the
# lagged-column assignments that follow modify this frame (not a view of the
# original) and do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2075 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location keeps each country's lags from bleeding into the other's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; replace those NaNs
# with 0 in one vectorized call rather than three separate fillna() statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (including rows that later form the
# test split) and on columns that include the lagged target — confirm this
# leakage is acceptable for the intended analysis.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first six principal components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# socio-economic variables; reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X comes
# from principal_df), so this step only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix: the six principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of a daily time series — training rows can
# postdate test rows; confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (test split stays out of the fit)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator; hyper-parameters below are overridden by the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scoring (R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9982718550442117
# Reuse the refitted best estimator from the grid search: GridSearchCV with
# refit=True (the default) already retrains the model with the best
# hyper-parameters on the full training set, so manually rebuilding a
# RandomForestRegressor with the same params and random_state=42 duplicates
# work for an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# normalized distributions, not a regression error metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X holds principal components, so these importances describe
# PCA components, not the original variables named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.946276 |
| 2 | extreme_poverty | 0.036129 |
| 5 | population | 0.011335 |
| 3 | gdp_per_capita | 0.003979 |
| 0 | hospital_beds_per_thousand | 0.001546 |
| 4 | population_density | 0.000735 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this analysis step.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# Restrict to the two countries under comparison. Take an explicit copy so the
# lagged-column assignments that follow modify this frame (not a view of the
# original) and do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 43.0 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location keeps each country's lags from bleeding into the other's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; replace those NaNs
# with 0 in one vectorized call rather than three separate fillna() statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (including rows that later form the
# test split) and on columns that include the lagged target — confirm this
# leakage is acceptable for the intended analysis.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first six principal components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# health variables; reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X comes
# from principal_df), so this step only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# Feature matrix: the six principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of a daily time series — training rows can
# postdate test rows; confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (test split stays out of the fit)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator; hyper-parameters below are overridden by the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scoring (R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9957145121672465
# Reuse the refitted best estimator from the grid search: GridSearchCV with
# refit=True (the default) already retrains the model with the best
# hyper-parameters on the full training set, so manually rebuilding a
# RandomForestRegressor with the same params and random_state=42 duplicates
# work for an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# normalized distributions, not a regression error metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X holds principal components, so these importances describe
# PCA components, not the original variables named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.414324 |
| 0 | cardiovasc_death_rate | 0.413998 |
| 1 | diabetes_prevalence | 0.146518 |
| 2 | female_smokers | 0.021706 |
| 3 | male_smokers | 0.002908 |
| 4 | life_expectancy | 0.000546 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The two countries compared in this analysis step.
country1 = 'Romania'
country2 = 'Serbia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries under comparison. Take an explicit copy so the
# lagged-column assignments that follow modify this frame (not a view of the
# original) and do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Grouping by location keeps each country's lags from bleeding into the other's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; replace those NaNs
# with 0 in one vectorized call rather than three separate fillna() statements.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (including rows that later form the
# test split) and on columns that include the lagged target — confirm this
# leakage is acceptable for the intended analysis.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first six principal components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# socio-economic variables; reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used below (X comes
# from principal_df), so this step only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix: the six principal components; target: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split of a daily time series — training rows can
# postdate test rows; confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (test split stays out of the fit)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator; hyper-parameters below are overridden by the grid search)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scoring (R^2).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
# Reuse the refitted best estimator from the grid search: GridSearchCV with
# refit=True (the default) already retrains the model with the best
# hyper-parameters on the full training set, so manually rebuilding a
# RandomForestRegressor with the same params and random_state=42 duplicates
# work for an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# normalized distributions, not a regression error metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Rank the model inputs by impurity-based importance.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X holds principal components, so these importances describe
# PCA components, not the original variables named in selected_cols.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.753233 |
| 1 | human_development_index | 0.216832 |
| 2 | extreme_poverty | 0.021982 |
| 3 | gdp_per_capita | 0.007136 |
| 4 | population_density | 0.000814 |
| 0 | hospital_beds_per_thousand | 0.000003 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or
# configurable path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Slovenia'
country2 = 'Spain'
# Restrict to the population-health-index features (plus identifiers and the
# target) and keep only the rows belonging to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 44.5 | 0.536669 |
2125 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is computed per location, so one country's history never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling treats "no history yet" as "zero mortality";
# dropping the warm-up rows instead would avoid fabricating observations.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'date' to PCA — including
# 'Mortality Rate' (the prediction target) and the three lag columns — and the
# columns are not standardized first, so the largest-scale column dominates the
# fit. Both look unintended; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixes of ALL
# the PCA inputs (including the target and lag columns) — not the raw features.
# Reusing the raw feature names makes the later feature-importance table read as
# a ranking of the raw features, which it is not; 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df below), so this step has no effect on the analysis.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of time-series rows mixes past and future;
# together with the target/lag columns inside the PCA inputs this inflates the
# reported scores. A chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): the scaler is correctly fit on the training split only, but the
# PCA above was fit on the full dataset (train + test), which is itself leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3 * 3 * 3 * 3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold CV on rows derived from a time series lets the model
# see temporally adjacent, near-duplicate rows across folds; TimeSeriesSplit
# would give a more honest score. refit=True (the default) also retrains the
# best configuration on the whole training set as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985825851316085
# Retrain a Random Forest configured with the tuned hyperparameters found above.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — applied to raw mortality
# values this is not a standard regression metric; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004655331865096316 R2 Score: 0.999286023415865 RMSE: 0.068230 Entropy Value: 0.00032059930777523355
# Rank the tuned model's inputs by Random Forest importance score, highest first.
# NOTE(review): the model was trained on principal components, so these scores
# describe the (re-labeled) components rather than the raw features themselves.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.897896 |
| 0 | cardiovasc_death_rate | 0.073585 |
| 2 | female_smokers | 0.020414 |
| 3 | male_smokers | 0.004171 |
| 5 | median_age | 0.003642 |
| 4 | life_expectancy | 0.000293 |
# Load the dataframe of the first country in each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Slovenia'
country2 = 'Spain'
# Restrict to the country-health-index features (plus identifiers and the
# target) and keep only the rows belonging to the two countries under comparison.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 34272.36 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.84 | 102.619 | 2119843 | 0.536669 |
2125 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is computed per location, so one country's history never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling treats "no history yet" as "zero mortality";
# dropping the warm-up rows instead would avoid fabricating observations.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'date' to PCA — including
# 'Mortality Rate' (the prediction target) and the three lag columns — and the
# columns are not standardized first, so the largest-scale column ('population'
# here) dominates the fit. Both look unintended; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixes of ALL
# the PCA inputs (including the target and lag columns) — not the raw features.
# Reusing the raw feature names makes the later feature-importance table read as
# a ranking of the raw features, which it is not; 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df below), so this step has no effect on the analysis.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of time-series rows mixes past and future;
# together with the target/lag columns inside the PCA inputs this inflates the
# reported scores. A chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): the scaler is correctly fit on the training split only, but the
# PCA above was fit on the full dataset (train + test), which is itself leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3 * 3 * 3 * 3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold CV on rows derived from a time series lets the model
# see temporally adjacent, near-duplicate rows across folds; TimeSeriesSplit
# would give a more honest score. refit=True (the default) also retrains the
# best configuration on the whole training set as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983652038818789
# Retrain a Random Forest configured with the tuned hyperparameters found above.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — applied to raw mortality
# values this is not a standard regression metric; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006147021306110068 R2 Score: 0.9990572467437505 RMSE: 0.078403 Entropy Value: 0.00037964640931083954
# Rank the tuned model's inputs by Random Forest importance score, highest first.
# NOTE(review): the model was trained on principal components, so these scores
# describe the (re-labeled) components rather than the raw features themselves.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956053 |
| 2 | extreme_poverty | 0.023320 |
| 5 | population | 0.012961 |
| 0 | hospital_beds_per_thousand | 0.004916 |
| 3 | gdp_per_capita | 0.002334 |
| 4 | population_density | 0.000416 |
# Load the dataframe of the first country in each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Restrict to the population-health-index features (plus identifiers and the
# target) and keep only the rows belonging to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 43.1 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is computed per location, so one country's history never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling treats "no history yet" as "zero mortality";
# dropping the warm-up rows instead would avoid fabricating observations.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'date' to PCA — including
# 'Mortality Rate' (the prediction target) and the three lag columns — and the
# columns are not standardized first, so the largest-scale column dominates the
# fit. Both look unintended; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixes of ALL
# the PCA inputs (including the target and lag columns) — not the raw features.
# Reusing the raw feature names makes the later feature-importance table read as
# a ranking of the raw features, which it is not; 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df below), so this step has no effect on the analysis.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split of time-series rows mixes past and future;
# together with the target/lag columns inside the PCA inputs this inflates the
# reported scores. A chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): the scaler is correctly fit on the training split only, but the
# PCA above was fit on the full dataset (train + test), which is itself leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3 * 3 * 3 * 3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold CV on rows derived from a time series lets the model
# see temporally adjacent, near-duplicate rows across folds; TimeSeriesSplit
# would give a more honest score. refit=True (the default) also retrains the
# best configuration on the whole training set as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Best CV score: 0.9600280576636526
# Retrain a Random Forest configured with the tuned hyperparameters found above.
# (grid_search.best_estimator_ is an equivalent, already-refit alternative.)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — applied to raw mortality
# values this is not a standard regression metric; confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.7087942445483512 R2 Score: 0.9720119467742431 RMSE: 0.841899 Entropy Value: 0.005512737732013974
# Rank the tuned model's inputs by Random Forest importance score, highest first.
# NOTE(review): the model was trained on principal components, so these scores
# describe the (re-labeled) components rather than the raw features themselves.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.816569 |
| 1 | diabetes_prevalence | 0.048619 |
| 5 | median_age | 0.047136 |
| 2 | female_smokers | 0.042191 |
| 3 | male_smokers | 0.032730 |
| 4 | life_expectancy | 0.012756 |
# Load the dataframe of the first country in each country pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Restrict to the country-health-index features (plus identifiers and the
# target) and keep only the rows belonging to the two countries under comparison.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.322149 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country timeseries into a supervised-learning layout by adding
# lagged mortality-rate columns (1 day, 7 days, and 30 days back). The shift is
# computed per country so one country's history never bleeds into another's,
# and rows with no prior observation get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the prediction
# target) and the lag columns in the PCA fit -- likely unintended target
# leakage that inflates downstream scores; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns of the projected data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components merely relabeled with the
# original feature names; each PC mixes all inputs, so downstream "feature
# importances" computed on X describe PCs, not the raw features -- confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; with daily timeseries rows
# and lagged features this leaks temporally adjacent information between train
# and test -- a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the train/test feature matrices with the scaler fitted on the
# training data only (keeps test-set statistics out of the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixes the bootstrap for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.958329239005943
# Refit a Random Forest using the hyperparameters selected by the grid search.
# Unpacking best_params_ passes the same n_estimators / max_depth /
# min_samples_split / min_samples_leaf values the search found.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both vectors to probability
# distributions and returns their KL divergence -- not a conventional
# regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.5800890923694779 R2 Score: 0.9770941080323481 RMSE: 0.761636 Entropy Value: 0.004716031800673148
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with the original
# feature names, so these "importances" describe principal components rather
# than the raw features -- interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.832172 |
| 5 | population | 0.070924 |
| 2 | extreme_poverty | 0.041400 |
| 3 | gdp_per_capita | 0.035553 |
| 4 | population_density | 0.019898 |
| 0 | hospital_beds_per_thousand | 0.000053 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Iceland'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so the lagged-mortality
# columns assigned later do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 37.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 37.3 | 0.11011 |
2063 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country timeseries into a supervised-learning layout by adding
# lagged mortality-rate columns (1 day, 7 days, and 30 days back). The shift is
# computed per country so one country's history never bleeds into another's,
# and rows with no prior observation get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the prediction
# target) and the lag columns in the PCA fit -- likely unintended target
# leakage that inflates downstream scores; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns of the projected data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components merely relabeled with the
# original feature names; each PC mixes all inputs, so downstream "feature
# importances" computed on X describe PCs, not the raw features -- confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; with daily timeseries rows
# and lagged features this leaks temporally adjacent information between train
# and test -- a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the train/test feature matrices with the scaler fitted on the
# training data only (keeps test-set statistics out of the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixes the bootstrap for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9764230567132627
# Refit a Random Forest using the hyperparameters selected by the grid search.
# Unpacking best_params_ passes the same n_estimators / max_depth /
# min_samples_split / min_samples_leaf values the search found.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both vectors to probability
# distributions and returns their KL divergence -- not a conventional
# regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006385841997698732 R2 Score: 0.99674427235998 RMSE: 0.025270 Entropy Value: 0.0005132911833312269
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with the original
# feature names, so these "importances" describe principal components rather
# than the raw features -- interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.522674 |
| 0 | cardiovasc_death_rate | 0.418221 |
| 2 | female_smokers | 0.031807 |
| 5 | median_age | 0.012626 |
| 3 | male_smokers | 0.007793 |
| 4 | life_expectancy | 0.006879 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Iceland'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so the lagged-mortality
# columns assigned later do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2063 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country timeseries into a supervised-learning layout by adding
# lagged mortality-rate columns (1 day, 7 days, and 30 days back). The shift is
# computed per country so one country's history never bleeds into another's,
# and rows with no prior observation get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the prediction
# target) and the lag columns in the PCA fit -- likely unintended target
# leakage that inflates downstream scores; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns of the projected data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components merely relabeled with the
# original feature names; each PC mixes all inputs, so downstream "feature
# importances" computed on X describe PCs, not the raw features -- confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; with daily timeseries rows
# and lagged features this leaks temporally adjacent information between train
# and test -- a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the train/test feature matrices with the scaler fitted on the
# training data only (keeps test-set statistics out of the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixes the bootstrap for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9766546149550385
# Refit a Random Forest using the hyperparameters selected by the grid search.
# Unpacking best_params_ passes the same n_estimators / max_depth /
# min_samples_split / min_samples_leaf values the search found.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both vectors to probability
# distributions and returns their KL divergence -- not a conventional
# regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006872215957745136 R2 Score: 0.9964963017484805 RMSE: 0.026215 Entropy Value: 0.0005144780719953724
# Rank model inputs by impurity-based importance, largest first.
# NOTE(review): X was built from PCA components relabeled with the original
# feature names, so these "importances" describe principal components rather
# than the raw features -- interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.918759 |
| 2 | extreme_poverty | 0.029885 |
| 5 | population | 0.021024 |
| 0 | hospital_beds_per_thousand | 0.013524 |
| 3 | gdp_per_capita | 0.010039 |
| 4 | population_density | 0.006769 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path -- consider a relative path or
# configurable data directory so the analysis runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so the lagged-mortality
# columns assigned later do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 38.7 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country timeseries into a supervised-learning layout by adding
# lagged mortality-rate columns (1 day, 7 days, and 30 days back). The shift is
# computed per country so one country's history never bleeds into another's,
# and rows with no prior observation get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag_days)
                           .fillna(0))
# Fit PCA on every column from index 2 onward to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the prediction
# target) and the lag columns in the PCA fit -- likely unintended target
# leakage that inflates downstream scores; confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns of the projected data.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components merely relabeled with the
# original feature names; each PC mixes all inputs, so downstream "feature
# importances" computed on X describe PCs, not the raw features -- confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles randomly; with daily timeseries rows
# and lagged features this leaks temporally adjacent information between train
# and test -- a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the train/test feature matrices with the scaler fitted on the
# training data only (keeps test-set statistics out of the scaling).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixes the bootstrap for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9979483134774702
# Refit a Random Forest using the hyper-parameters the grid search selected above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Hold-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after renormalizing both
# vectors to sum to 1; mortality rates are not probability distributions, so interpret this
# number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0024530560615069748
R2 Score: 0.9989257488789922
RMSE: 0.049528
Entropy Value: 0.0004755467309312025
# Rank the model inputs by the fitted forest's impurity-based importances, highest first.
# NOTE(review): X holds principal components, so these labels name PCs, not the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.943395 |
| 2 | female_smokers | 0.030175 |
| 1 | diabetes_prevalence | 0.018569 |
| 5 | median_age | 0.005552 |
| 3 | male_smokers | 0.001946 |
| 4 | life_expectancy | 0.000363 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a relative path or a
# configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: renders the frame as a table when run as the last line of a notebook cell.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Extracting important features for the Random Forest Model Analysis for the country health
# index: restrict to the chosen country pair and to the socioeconomic / health-system
# columns plus the target, in a single .loc selection.
pair = [country1, country2]
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin(pair), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged copies of the target: previous day, previous week, previous month.
# NOTE(review): fillna(0) fabricates a zero mortality rate for each country's earliest rows
# (where no lag exists) instead of dropping them -- confirm this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address multi-collinearity
# among the predictor columns.
# NOTE(review): columns from position 2 onward here include 'Mortality Rate' and its lag
# columns (see the column selection and lag creation above), so the target itself feeds the
# PCA -- leakage that helps explain the near-perfect scores downstream.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lag columns created above,
# so the target leaks into the principal components used as model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names is misleading (the feature-importance table below labels PCs as if they were
# raw features). Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so this encoding is
# effectively dead work ('Mortality Rate' is unaffected by get_dummies).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is daily time-series data; a random split places adjacent days in train and
# test, which inflates scores. A time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA; PCA is variance-sensitive, so unscaled inputs
# (e.g. population) dominate the components. Conventionally standardize before PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default KFold partitions rows without regard to their dates, so the
# temporal-leakage caveat from the random train/test split applies to the CV score too;
# TimeSeriesSplit would be the safer choice for daily data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV's default scoring for a regressor is the estimator's R^2 score)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
# Refit a Random Forest using the hyper-parameters the grid search selected above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Hold-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after renormalizing both
# vectors to sum to 1; mortality rates are not probability distributions, so interpret this
# number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001882911265242772
R2 Score: 0.9991754287359407
RMSE: 0.043393
Entropy Value: 0.0004605212242426924
# Rank the model inputs by the fitted forest's impurity-based importances, highest first.
# NOTE(review): X holds principal components, so these labels name PCs, not the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.523426 |
| 1 | human_development_index | 0.437981 |
| 2 | extreme_poverty | 0.036239 |
| 3 | gdp_per_capita | 0.001743 |
| 4 | population_density | 0.000594 |
| 0 | hospital_beds_per_thousand | 0.000017 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a relative path or a
# configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: renders the frame as a table when run as the last line of a notebook cell.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'United States'
# Extracting important features for the Random Forest Model Analysis for the population health
# index: restrict to the chosen country pair and to the population-health columns plus the
# target, in a single .loc selection.
pair = [country1, country2]
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin(pair), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 38.3 | 1.084791 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged copies of the target: previous day, previous week, previous month.
# NOTE(review): fillna(0) fabricates a zero mortality rate for each country's earliest rows
# (where no lag exists) instead of dropping them -- confirm this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address multi-collinearity
# among the predictor columns.
# NOTE(review): columns from position 2 onward here include 'Mortality Rate' and its lag
# columns (see the column selection and lag creation above), so the target itself feeds the
# PCA -- leakage that helps explain the near-perfect scores downstream.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lag columns created above,
# so the target leaks into the principal components used as model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names is misleading (the feature-importance table below labels PCs as if they were
# raw features). Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so this encoding is
# effectively dead work ('Mortality Rate' is unaffected by get_dummies).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is daily time-series data; a random split places adjacent days in train and
# test, which inflates scores. A time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA; PCA is variance-sensitive, so unscaled inputs
# dominate the components. Conventionally standardize before PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default KFold partitions rows without regard to their dates, so the
# temporal-leakage caveat from the random train/test split applies to the CV score too;
# TimeSeriesSplit would be the safer choice for daily data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV's default scoring for a regressor is the estimator's R^2 score)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9737847333872919
# Refit a Random Forest using the hyper-parameters the grid search selected above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Hold-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after renormalizing both
# vectors to sum to 1; mortality rates are not probability distributions, so interpret this
# number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016014923250739586
R2 Score: 0.984946595950419
RMSE: 0.126550
Entropy Value: 0.001487608489672066
# Rank the model inputs by the fitted forest's impurity-based importances, highest first.
# NOTE(review): X holds principal components, so these labels name PCs, not the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.540912 |
| 0 | cardiovasc_death_rate | 0.306352 |
| 5 | median_age | 0.082899 |
| 2 | female_smokers | 0.042828 |
| 3 | male_smokers | 0.019670 |
| 4 | life_expectancy | 0.007340 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a relative path or a
# configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: renders the frame as a table when run as the last line of a notebook cell.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovakia'
country2 = 'United States'
# Extracting important features for the Random Forest Model Analysis for the country health
# index: restrict to the chosen country pair and to the socioeconomic / health-system
# columns plus the target, in a single .loc selection.
pair = [country1, country2]
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin(pair), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged copies of the target: previous day, previous week, previous month.
# NOTE(review): fillna(0) fabricates a zero mortality rate for each country's earliest rows
# (where no lag exists) instead of dropping them -- confirm this is intended.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to address multi-collinearity
# among the predictor columns.
# NOTE(review): columns from position 2 onward here include 'Mortality Rate' and its lag
# columns (see the column selection and lag creation above), so the target itself feeds the
# PCA -- leakage that helps explain the near-perfect scores downstream.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lag columns created above,
# so the target leaks into the principal components used as model inputs.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original variables; reusing the
# raw feature names is misleading (the feature-importance table below labels PCs as if they were
# raw features). Consider naming them PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so this encoding is
# effectively dead work ('Mortality Rate' is unaffected by get_dummies).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this is daily time-series data; a random split places adjacent days in train and
# test, which inflates scores. A time-based split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied AFTER PCA; PCA is variance-sensitive, so unscaled inputs
# (e.g. population) dominate the components. Conventionally standardize before PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using statistics learned from the training set only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default KFold partitions rows without regard to their dates, so the
# temporal-leakage caveat from the random train/test split applies to the CV score too;
# TimeSeriesSplit would be the safer choice for daily data.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (GridSearchCV's default scoring for a regressor is the estimator's R^2 score)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9745517168509374
# Refit a Random Forest using the hyper-parameters the grid search selected above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Hold-out predictions on the scaled test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after renormalizing both
# vectors to sum to 1; mortality rates are not probability distributions, so interpret this
# number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01914917351005437
R2 Score: 0.9820005227905748
RMSE: 0.138381
Entropy Value: 0.0017834873721826897
# Rank the model inputs by the fitted forest's impurity-based importances, highest first.
# NOTE(review): X holds principal components, so these labels name PCs, not the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.738239 |
| 5 | population | 0.187593 |
| 2 | extreme_poverty | 0.045517 |
| 3 | gdp_per_capita | 0.018252 |
| 4 | population_density | 0.010388 |
| 0 | hospital_beds_per_thousand | 0.000009 |
# Country Pair by Pair Analysis relative to diabetes prevalence
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path -- not portable; prefer a relative path or a
# configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
# Bare expression: renders the frame as a table when run as the last line of a notebook cell.
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on diabetes prevalence (13 pairs of countries)
# The list below is the first country of each defined pair, in the exact order the
# original per-country dataframes were concatenated.
pair_first_countries = [
    "Austria", "Belgium", "Bulgaria", "Cyprus", "Czechia", "Denmark", "Estonia",
    "Finland", "France", "Netherlands", "Portugal", "Slovakia", "United Kingdom",
    "Switzerland", "Canada", "Serbia", "Luxembourg", "Romania", "Ireland", "Latvia",
    "Iceland", "Italy", "Sweden", "Spain", "Slovenia", "United States",
]
frames = []
for country in pair_first_countries:
    subset = df[df.location == country]
    if country == "United Kingdom":
        # tail(-2) drops the first two UK rows before concatenation, as in the
        # original per-country preparation.
        subset = subset.tail(-2)
    frames.append(subset)
# Concatenate the first country of each pair into a single dataframe
dataframe_one = pd.concat(frames)
# Exporting final_dataframe to CSV file
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run
country1, country2 = 'Belgium', 'Estonia'
# Extracting important features for Random Forest Model Analysis for the population health index
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Restrict to the two paired countries and the population-health columns in one step
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest is an
# ensemble method for non-sequential, tabular data, so each row must carry its own
# history. We therefore add lagged mortality features (previous day / week / month)
# via pandas shift() within each country, which lets the model be used directly to
# assess which variables best predict COVID-19 mortality per country.
lag_specs = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_specs.items():
    # Shift within each country so lags never cross a country boundary, then
    # replace the leading NaNs (rows with no history yet) with 0.
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the PCA inputs later used to predict
# it — this likely inflates the downstream R^2. PCA is also applied to unscaled
# data here (standardization only happens after the split). Worth re-running with
# the target columns excluded and the inputs standardized first.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 6 principal components — the same count as the number of input variables
# for the Random Forest Model Analysis for the population health index.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
# NOTE(review): these column labels name principal components after the original
# features, but each PC is a mixture of every column fed to the PCA (including the
# mortality columns), so the labels are misleading when read as raw features.
pc_labels = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies(). NOTE(review): the resulting dummy
# columns are not part of X below — the model inputs come from the PCA output only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test statistics never leak
# into the scaling parameters.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by grid search next
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986082260800637
# Refit a Random Forest using the best hyperparameters found by the grid search
# (best_params_ holds exactly n_estimators, max_depth, min_samples_split,
# min_samples_leaf from the grid above).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out test set with the tuned Random Forest
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(a, b) computes the KL divergence between the
# normalized distributions of a and b — it is not a conventional regression metric,
# so interpret this value with care.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % rmse)
print("Entropy Value:", entropy_val)
MSE: 0.015400807395380635 R2 Score: 0.9987798823156795 RMSE: 0.124100 Entropy Value: 0.0006057917033947405
# Rank the model inputs by Random Forest importance (descending).
# NOTE(review): X was built from PCA outputs, so these labels actually name
# principal components, not the raw features themselves.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.721427 |
| 0 | cardiovasc_death_rate | 0.231062 |
| 2 | male_smokers | 0.038212 |
| 5 | median_age | 0.006339 |
| 3 | life_expectancy | 0.002593 |
| 4 | aged_65_older | 0.000368 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run
country1, country2 = 'Belgium', 'Estonia'
# Extracting important features for the Random Forest Model Analysis for the country health index
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Restrict to the two paired countries and the country-health columns in one step
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 4.69 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest is an
# ensemble method for non-sequential, tabular data, so each row must carry its own
# history. We therefore add lagged mortality features (previous day / week / month)
# via pandas shift() within each country, which lets the model be used directly to
# assess which variables best predict COVID-19 mortality per country.
lag_specs = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_specs.items():
    # Shift within each country so lags never cross a country boundary, then
    # replace the leading NaNs (rows with no history yet) with 0.
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the PCA inputs later used to predict
# it — this likely inflates the downstream R^2. PCA is also applied to unscaled
# data here (standardization only happens after the split). Worth re-running with
# the target columns excluded and the inputs standardized first.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 6 principal components — the same count as the number of input variables
# for the Random Forest Model Analysis for the country health index.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
# NOTE(review): these column labels name principal components after the original
# features, but each PC is a mixture of every column fed to the PCA (including the
# mortality columns), so the labels are misleading when read as raw features.
pc_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies(). NOTE(review): the resulting dummy
# columns are not part of X below — the model inputs come from the PCA output only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test statistics never leak
# into the scaling parameters.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by grid search next
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984095979233917
# Refit a Random Forest using the best hyperparameters found by the grid search
# (best_params_ holds exactly n_estimators, max_depth, min_samples_split,
# min_samples_leaf from the grid above).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out test set with the tuned Random Forest
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(a, b) computes the KL divergence between the
# normalized distributions of a and b — it is not a conventional regression metric,
# so interpret this value with care.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % rmse)
print("Entropy Value:", entropy_val)
MSE: 0.016179009133714634 R2 Score: 0.9987182298530174 RMSE: 0.127197 Entropy Value: 0.0006399632348900017
# Rank the model inputs by Random Forest importance (descending).
# NOTE(review): X was built from PCA outputs, so these labels actually name
# principal components, not the raw features themselves.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.932614 |
| 2 | extreme_poverty | 0.038249 |
| 5 | population | 0.019770 |
| 0 | hospital_beds_per_thousand | 0.005374 |
| 3 | gdp_per_capita | 0.003504 |
| 4 | population_density | 0.000490 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison in this run
country1, country2 = 'France', 'Iceland'
# Extracting important features for Random Forest Model Analysis for the population health index
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Restrict to the two paired countries and the population-health columns in one step
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8377 | France | 1/25/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8378 | France | 1/26/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8379 | France | 1/27/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| 8380 | France | 1/28/2020 | 86.060 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2107 rows × 9 columns
# Convert the time series into a supervised-learning table: a Random Forest is an
# ensemble method for non-sequential, tabular data, so each row must carry its own
# history. We therefore add lagged mortality features (previous day / week / month)
# via pandas shift() within each country, which lets the model be used directly to
# assess which variables best predict COVID-19 mortality per country.
lag_specs = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_specs.items():
    # Shift within each country so lags never cross a country boundary, then
    # replace the leading NaNs (rows with no history yet) with 0.
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] still contains 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the PCA inputs later used to predict
# it — this likely inflates the downstream R^2. PCA is also applied to unscaled
# data here (standardization only happens after the split). Worth re-running with
# the target columns excluded and the inputs standardized first.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 6 principal components — the same count as the number of input variables
# for the Random Forest Model Analysis for the population health index.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
# NOTE(review): these column labels name principal components after the original
# features, but each PC is a mixture of every column fed to the PCA (including the
# mortality columns), so the labels are misleading when read as raw features.
pc_labels = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(data=principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies(). NOTE(review): the resulting dummy
# columns are not part of X below — the model inputs come from the PCA output only.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only, so test statistics never leak
# into the scaling parameters.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned by grid search next
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.990388976505925
# Refit a Random Forest using the best hyperparameters found by the grid search
# (best_params_ holds exactly n_estimators, max_depth, min_samples_split,
# min_samples_leaf from the grid above).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out test set with the tuned Random Forest
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 Score, and Entropy.
# NOTE(review): scipy.stats.entropy(a, b) computes the KL divergence between the
# normalized distributions of a and b — it is not a conventional regression metric,
# so interpret this value with care.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % rmse)
print("Entropy Value:", entropy_val)
MSE: 0.1330022202706 R2 Score: 0.9894237774759248 RMSE: 0.364695 Entropy Value: 0.0029852162511977524
# Rank the model inputs by Random Forest importance (descending).
# NOTE(review): X was built from PCA outputs, so these labels actually name
# principal components, not the raw features themselves.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.704205 |
| 0 | cardiovasc_death_rate | 0.270532 |
| 2 | male_smokers | 0.016687 |
| 3 | life_expectancy | 0.005344 |
| 5 | median_age | 0.002318 |
| 4 | aged_65_older | 0.000914 |
# Load the dataframe with the first country of each country pairing,
# produced in the previous step.
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
_dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'France'
country2 = 'Iceland'

# Keep only the country-health-index columns and restrict the rows to the
# two countries of interest.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2107 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: previous day (1), previous week (7),
# previous month (30 days).
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows with no history are filled with 0, matching the observed 0 mortality
    # at the start of each country's series -- NOTE(review): confirm 0 is the
    # intended baseline rather than e.g. forward/backward fill.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the principal components the
# model is later trained on -- this likely inflates the reported R^2; confirm
# whether the target columns should be excluded here.
# NOTE(review): PCA is also fit on the full dataset before the train/test
# split, which leaks test-set statistics into the transformation.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per input variable of the
# country-health-index Random Forest analysis.
n_components = 6
pc_matrix = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the later feature-importance
# table read as if it ranked the original features -- confirm intent.
principal_df = pd.DataFrame(data=pc_matrix, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies().
# NOTE(review): the resulting dummy columns are never used afterwards -- only
# 'Mortality Rate' is read from df_updated below, so the encoding appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs: the six principal components (labelled with the original
# feature names); target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train-test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn standardization statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the statistics learned from the
# training set only (no test-set leakage in the scaling step itself).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 spreads
# the candidate fits over all CPU cores -- results are unchanged because every
# fit is still seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9906321989356222
# GridSearchCV ran with refit=True (the default), so it has already retrained
# a RandomForestRegressor with the best hyperparameters on the full training
# set; reuse that fitted estimator instead of duplicating the refit by hand.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence; mortality rates
# are not probabilities, so the meaning of this "entropy" metric is
# questionable -- confirm it is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.1263773231369439 R2 Score: 0.9899505836160186 RMSE: 0.355496 Entropy Value: 0.0032779127594959943
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): the model was fit on principal components, so these rows rank
# PC1..PC6; labelling them with the original column names is misleading -- confirm.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.958053 |
| 2 | extreme_poverty | 0.017445 |
| 5 | population | 0.014344 |
| 3 | gdp_per_capita | 0.005157 |
| 0 | hospital_beds_per_thousand | 0.004252 |
| 4 | population_density | 0.000749 |
# Load the dataframe with the first country of each country pairing,
# produced in the previous step.
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
_dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Ireland'
country2 = 'Italy'

# Keep only the population-health-index columns and restrict the rows to the
# two countries of interest.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: previous day (1), previous week (7),
# previous month (30 days).
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows with no history are filled with 0, matching the observed 0 mortality
    # at the start of each country's series -- NOTE(review): confirm 0 is the
    # intended baseline rather than e.g. forward/backward fill.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the principal components the
# model is later trained on -- this likely inflates the reported R^2; confirm
# whether the target columns should be excluded here.
# NOTE(review): PCA is also fit on the full dataset before the train/test
# split, which leaks test-set statistics into the transformation.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per input variable of the
# population-health-index Random Forest analysis.
n_components = 6
pc_matrix = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the later feature-importance
# table read as if it ranked the original features -- confirm intent.
principal_df = pd.DataFrame(data=pc_matrix, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies().
# NOTE(review): the resulting dummy columns are never used afterwards -- only
# 'Mortality Rate' is read from df_updated below, so the encoding appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs: the six principal components (labelled with the original
# feature names); target: the mortality rate.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train-test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn standardization statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the statistics learned from the
# training set only (no test-set leakage in the scaling step itself).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 spreads
# the candidate fits over all CPU cores -- results are unchanged because every
# fit is still seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980680064496482
# GridSearchCV ran with refit=True (the default), so it has already retrained
# a RandomForestRegressor with the best hyperparameters on the full training
# set; reuse that fitted estimator instead of duplicating the refit by hand.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence; mortality rates
# are not probabilities, so the meaning of this "entropy" metric is
# questionable -- confirm it is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.019286091351583046 R2 Score: 0.9984156039191903 RMSE: 0.138874 Entropy Value: 0.0005198299298636055
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): the model was fit on principal components, so these rows rank
# PC1..PC6; labelling them with the original column names is misleading -- confirm.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.837218 |
| 1 | female_smokers | 0.131195 |
| 2 | male_smokers | 0.026871 |
| 3 | life_expectancy | 0.002415 |
| 5 | median_age | 0.001822 |
| 4 | aged_65_older | 0.000480 |
# Load the dataframe with the first country of each country pairing,
# produced in the previous step.
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
_dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'Ireland'
country2 = 'Italy'

# Keep only the country-health-index columns and restrict the rows to the
# two countries of interest.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: previous day (1), previous week (7),
# previous month (30 days).
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    # Rows with no history are filled with 0, matching the observed 0 mortality
    # at the start of each country's series -- NOTE(review): confirm 0 is the
    # intended baseline rather than e.g. forward/backward fill.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Principal Component Analysis to address multi-collinearity among the inputs.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target leaks into the principal components the
# model is later trained on -- this likely inflates the reported R^2; confirm
# whether the target columns should be excluded here.
# NOTE(review): PCA is also fit on the full dataset before the train/test
# split, which leaks test-set statistics into the transformation.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first six principal components -- one per input variable of the
# country-health-index Random Forest analysis.
n_components = 6
pc_matrix = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# variables; reusing the raw feature names makes the later feature-importance
# table read as if it ranked the original features -- confirm intent.
principal_df = pd.DataFrame(data=pc_matrix, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column with get_dummies().
# NOTE(review): the resulting dummy columns are never used afterwards -- only
# 'Mortality Rate' is read from df_updated below, so the encoding appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])

# Model inputs: the six principal components (labelled with the original
# feature names); target: the mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population_density',
                 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train-test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn standardization statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the statistics learned from the
# training set only (no test-set leakage in the scaling step itself).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 spreads
# the candidate fits over all CPU cores -- results are unchanged because every
# fit is still seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986413470709365
# GridSearchCV ran with refit=True (the default), so it has already retrained
# a RandomForestRegressor with the best hyperparameters on the full training
# set; reuse that fitted estimator instead of duplicating the refit by hand.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to
# probability distributions and returns their KL divergence; mortality rates
# are not probabilities, so the meaning of this "entropy" metric is
# questionable -- confirm it is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013803653998387413 R2 Score: 0.9988659985635657 RMSE: 0.117489 Entropy Value: 0.0004399732479586434
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): the model was fit on principal components, so these rows rank
# PC1..PC6; labelling them with the original column names is misleading -- confirm.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937618 |
| 2 | extreme_poverty | 0.027005 |
| 5 | population | 0.023078 |
| 0 | hospital_beds_per_thousand | 0.008608 |
| 3 | gdp_per_capita | 0.002772 |
| 4 | population_density | 0.000918 |
# Load the dataframe with the first country of each country pairing,
# produced in the previous step.
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
_dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(_dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Latvia'
country2 = 'Luxembourg'

# Keep only the population-health-index columns and restrict the rows to the
# two countries of interest.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2079 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country mortality lags: previous day (1), previous week (7), and
# previous month (30). shift() follows row order, so this assumes each
# country's rows are already sorted chronologically -- TODO confirm upstream.
mortality_series = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Early rows with no history yet get 0 rather than NaN.
    df_updated[lag_col] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns from index 2 onward to address multicollinearity.
# NOTE(review): this slice also contains 'Mortality Rate' and its lag columns,
# and the inputs are not standardized before PCA -- both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- matching the number of original predictors.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of ALL the PCA input
# columns, so labelling the components with single original feature names
# ('cardiovasc_death_rate', 'female_smokers', ...) was misleading and caused
# downstream feature importances to be wrongly attributed to individual
# predictors. Label them PC1..PC6 instead.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies (kept for completeness;
# the model below is trained on the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training data only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling statistics to both folds.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Show the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9972959686205798
# Refit a forest on the full training set with the tuned hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set diagnostics: MSE, RMSE, R^2, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns their KL divergence; treating mortality-rate series as probability
# distributions is unusual -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032690908969740737 R2 Score: 0.9917683753526738 RMSE: 0.057176 Entropy Value: 0.0011809055355708035
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.916922 |
| 2 | male_smokers | 0.032989 |
| 0 | cardiovasc_death_rate | 0.029222 |
| 5 | median_age | 0.016208 |
| 3 | life_expectancy | 0.004059 |
| 4 | aged_65_older | 0.000599 |
# Load the dataframe holding the first country of each pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Country-level (socio-economic) predictors used in this Random Forest analysis.
country_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
# Keep identifiers, the country predictors, and the target column.
df_updated = df_updated[['location', 'date', *country_cols, 'Mortality Rate']]
# Restrict the frame to the two countries in this pairing.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2079 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country mortality lags: previous day (1), previous week (7), and
# previous month (30). shift() follows row order, so this assumes each
# country's rows are already sorted chronologically -- TODO confirm upstream.
mortality_series = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Early rows with no history yet get 0 rather than NaN.
    df_updated[lag_col] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns from index 2 onward to address multicollinearity.
# NOTE(review): this slice also contains 'Mortality Rate' and its lag columns,
# and the inputs are not standardized before PCA -- both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- matching the number of original predictors.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of ALL the PCA input
# columns, so labelling the components with single original feature names
# ('hospital_beds_per_thousand', 'human_development_index', ...) was misleading
# and caused downstream feature importances to be wrongly attributed to
# individual predictors. Label them PC1..PC6 instead.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies (kept for completeness;
# the model below is trained on the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training data only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling statistics to both folds.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Show the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.997384664409495
# Refit a forest on the full training set with the tuned hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set diagnostics: MSE, RMSE, R^2, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns their KL divergence; treating mortality-rate series as probability
# distributions is unusual -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0031257920224898803 R2 Score: 0.9921292042755496 RMSE: 0.055909 Entropy Value: 0.0010800758453721978
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.919383 |
| 5 | population | 0.042867 |
| 2 | extreme_poverty | 0.034274 |
| 3 | gdp_per_capita | 0.002756 |
| 4 | population_density | 0.000677 |
| 0 | hospital_beds_per_thousand | 0.000044 |
# Load the dataframe holding the first country of each pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Netherlands'
country2 = 'Sweden'
# Population-health predictors used in this Random Forest analysis.
health_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers',
               'life_expectancy', 'aged_65_older', 'median_age']
# Keep identifiers, the health predictors, and the target column.
df_updated = df_updated[['location', 'date', *health_cols, 'Mortality Rate']]
# Restrict the frame to the two countries in this pairing.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country mortality lags: previous day (1), previous week (7), and
# previous month (30). shift() follows row order, so this assumes each
# country's rows are already sorted chronologically -- TODO confirm upstream.
mortality_series = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Early rows with no history yet get 0 rather than NaN.
    df_updated[lag_col] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns from index 2 onward to address multicollinearity.
# NOTE(review): this slice also contains 'Mortality Rate' and its lag columns,
# and the inputs are not standardized before PCA -- both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- matching the number of original predictors.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of ALL the PCA input
# columns, so labelling the components with single original feature names
# ('cardiovasc_death_rate', 'female_smokers', ...) was misleading and caused
# downstream feature importances to be wrongly attributed to individual
# predictors. Label them PC1..PC6 instead.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies (kept for completeness;
# the model below is trained on the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training data only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling statistics to both folds.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Show the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984774091696321
# Refit a forest on the full training set with the tuned hyperparameters.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set diagnostics: MSE, RMSE, R^2, and relative entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns their KL divergence; treating mortality-rate series as probability
# distributions is unusual -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010199687742194393 R2 Score: 0.9990245151847958 RMSE: 0.100994 Entropy Value: 0.00037837583326060144
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.973528 |
| 2 | male_smokers | 0.023249 |
| 3 | life_expectancy | 0.001411 |
| 0 | cardiovasc_death_rate | 0.000862 |
| 5 | median_age | 0.000565 |
| 4 | aged_65_older | 0.000385 |
# Load the dataframe holding the first country of each pairing (previous step).
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Netherlands'
country2 = 'Sweden'
# Country-level (socio-economic) predictors used in this Random Forest analysis.
country_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density',
                'population']
# Keep identifiers, the country predictors, and the target column.
df_updated = df_updated[['location', 'date', *country_cols, 'Mortality Rate']]
# Restrict the frame to the two countries in this pairing.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2100 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country mortality lags: previous day (1), previous week (7), and
# previous month (30). shift() follows row order, so this assumes each
# country's rows are already sorted chronologically -- TODO confirm upstream.
mortality_series = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Early rows with no history yet get 0 rather than NaN.
    df_updated[lag_col] = mortality_series.shift(lag_days).fillna(0)
# Fit PCA on all numeric columns from index 2 onward to address multicollinearity.
# NOTE(review): this slice also contains 'Mortality Rate' and its lag columns,
# and the inputs are not standardized before PCA -- both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- matching the number of original predictors.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# FIX: each principal component is a linear combination of ALL the PCA input
# columns, so labelling the components with single original feature names
# ('hospital_beds_per_thousand', 'human_development_index', ...) was misleading
# and caused downstream feature importances to be wrongly attributed to
# individual predictors. Label them PC1..PC6 instead.
selected_cols = [f'PC{i}' for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies (kept for completeness;
# the model below is trained on the principal components only).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training data only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise train and test features with the scaler fitted on the train split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation
# (regressor scoring defaults to R^2).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984750138014478
# Refit a Random Forest on the full training split using the tuned
# hyperparameters; best_params_ holds exactly the grid-searched keys.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, and "entropy".
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns their KL divergence — not a standard regression
# metric; interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010845809623248557 R2 Score: 0.9989627209319059 RMSE: 0.104143 Entropy Value: 0.0004003501440794584
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these rank PCA components (carrying original column names only
# as labels), not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.973698 |
| 2 | extreme_poverty | 0.023524 |
| 3 | gdp_per_capita | 0.001460 |
| 5 | population | 0.000883 |
| 4 | population_density | 0.000409 |
| 0 | hospital_beds_per_thousand | 0.000027 |
# Load the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'Austria'
# Keep identifiers, the population-health features, and the target, then
# restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2102 rows × 9 columns
# The lagged variables below (previous-day, previous-week, previous-month
# mortality, built with pandas shift()) convert the OWID COVID-19 time series
# into a supervised-learning table: a Random Forest is an ensemble method for
# non-sequential, tabular data, so each row must be a self-contained
# observation. This lets the model assess which variables best predict
# COVID-19 mortality (death rate) per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Rows with no prior observation in their country get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on every column after 'location'/'date' to address multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split) — consider scaling first and fitting on the training split
# only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable in the Random
# Forest Model Analysis for the population health index.
n_components = 6
component_names = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers',
                   'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): each PCA component is a linear mix of ALL inputs; labelling the
# components with the original column names is only a convenience, and the
# downstream feature importances rank components, not the raw variables.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=component_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(component_names)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise train and test features with the scaler fitted on the train split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation
# (regressor scoring defaults to R^2).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.9762513886260628
# Refit a Random Forest on the full training split using the tuned
# hyperparameters; best_params_ holds exactly the grid-searched keys.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, and "entropy".
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns their KL divergence — not a standard regression
# metric; interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 8.726735777741874 R2 Score: 0.7733967613328221 RMSE: 2.954105 Entropy Value: 0.027561275886581353
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these rank PCA components (carrying original column names only
# as labels), not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | median_age | 0.949010 |
| 1 | female_smokers | 0.028432 |
| 2 | male_smokers | 0.008241 |
| 0 | cardiovasc_death_rate | 0.005321 |
| 3 | life_expectancy | 0.004751 |
| 4 | aged_65_older | 0.004245 |
# Load the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'Austria'
# Keep identifiers, the country-health features, and the target, then
# restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 0.883564 |
2102 rows × 9 columns
# The lagged variables below (previous-day, previous-week, previous-month
# mortality, built with pandas shift()) convert the OWID COVID-19 time series
# into a supervised-learning table: a Random Forest is an ensemble method for
# non-sequential, tabular data, so each row must be a self-contained
# observation. This lets the model assess which variables best predict
# COVID-19 mortality (death rate) per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Rows with no prior observation in their country get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on every column after 'location'/'date' to address multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split) — consider scaling first and fitting on the training split
# only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable in the Random
# Forest Model Analysis for the country health index.
n_components = 6
component_names = ['hospital_beds_per_thousand', 'human_development_index',
                   'extreme_poverty', 'gdp_per_capita', 'population_density',
                   'population']
# NOTE(review): each PCA component is a linear mix of ALL inputs; labelling the
# components with the original column names is only a convenience, and the
# downstream feature importances rank components, not the raw variables.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=component_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(component_names)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise train and test features with the scaler fitted on the train split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation
# (regressor scoring defaults to R^2).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9836112086552532
# Refit a Random Forest on the full training split using the tuned
# hyperparameters; best_params_ holds exactly the grid-searched keys.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, and "entropy".
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns their KL divergence — not a standard regression
# metric; interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 6.845097630533553 R2 Score: 0.8222564161930779 RMSE: 2.616314 Entropy Value: 0.018393087069933763
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these rank PCA components (carrying original column names only
# as labels), not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.898081 |
| 2 | extreme_poverty | 0.045096 |
| 5 | population | 0.034537 |
| 4 | population_density | 0.014076 |
| 3 | gdp_per_capita | 0.007620 |
| 0 | hospital_beds_per_thousand | 0.000590 |
# Load the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Keep identifiers, the population-health features, and the target, then
# restrict the rows to the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated[keep_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 9 columns
# The lagged variables below (previous-day, previous-week, previous-month
# mortality, built with pandas shift()) convert the OWID COVID-19 time series
# into a supervised-learning table: a Random Forest is an ensemble method for
# non-sequential, tabular data, so each row must be a self-contained
# observation. This lets the model assess which variables best predict
# COVID-19 mortality (death rate) per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Rows with no prior observation in their country get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on every column after 'location'/'date' to address multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset (before the
# train/test split) — consider scaling first and fitting on the training split
# only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 6 principal components — one per input variable in the Random
# Forest Model Analysis for the population health index.
n_components = 6
component_names = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers',
                   'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): each PCA component is a linear mix of ALL inputs; labelling the
# components with the original column names is only a convenience, and the
# downstream feature importances rank components, not the raw variables.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=component_names)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(component_names)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise train and test features with the scaler fitted on the train split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; fixed seed keeps results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation
# (regressor scoring defaults to R^2).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9464328946652518
# Refit a Random Forest on the full training split using the tuned
# hyperparameters; best_params_ holds exactly the grid-searched keys.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, and "entropy".
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns their KL divergence — not a standard regression
# metric; interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002615323739285408 R2 Score: 0.9984048531108621 RMSE: 0.051140 Entropy Value: 0.0003186984164555006
# Rank the model inputs by Gini importance, highest first.
# NOTE(review): the inputs are PCA components that were merely *labelled* with
# the original feature names, so these importances describe components, not
# the raw variables.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.829005 |
| 5 | median_age | 0.070605 |
| 2 | male_smokers | 0.032670 |
| 3 | life_expectancy | 0.031136 |
| 1 | female_smokers | 0.024700 |
| 4 | aged_65_older | 0.011884 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Bulgaria'
country2 = 'Czechia'
# Restrict the frame to the country-health-index columns and to the two
# countries under comparison.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
             'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.630 | 0.900 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 9 columns
# Build lagged copies of the mortality rate (previous day / week / month) so
# the time series becomes a tabular supervised-learning problem. A Random
# Forest has no notion of sequence order, so lagged target columns are the
# standard way to give it temporal context. shift() is applied per country
# (grouped by 'location') so one country's history never bleeds into
# another's; the leading rows with no history get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Principal Component Analysis to reduce multi-collinearity among the inputs.
#
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], a slice that
# still contained the target column 'Mortality Rate'. Feeding the target into
# the feature matrix leaks the answer into X and inflates the test scores.
# The target and the identifier columns are excluded here; the lagged
# mortality columns are kept, since lagged targets are legitimate predictors.
#
# NOTE(review): two further issues are left untouched to keep this change minimal:
#   * PCA is fitted on the full dataset *before* the train/test split, so test
#     rows still influence the learned components — fit on the training split only.
#   * PCA runs on unstandardised data, so large-scale columns (e.g. population)
#     dominate the leading components; consider scaling before PCA.
feature_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 6 principal components — one per static input variable of the
# country health index (the feature matrix itself has 9 columns incl. lags).
n_components = 6
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, not the original variables;
# the original names are kept only for compatibility with the code that
# follows, and downstream "feature importances" refer to the components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardise to zero mean / unit variance; statistics come from the training
# split only and are reused on the test split (correct practice).
# NOTE(review): scaling is applied *after* PCA here; PCA on unstandardised
# inputs lets large-scale columns (e.g. population) dominate the components —
# consider scaling before PCA instead.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV overrides it
# with each value from the grid below)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 uses an unshuffled KFold, so
# consecutive (time-ordered) rows land in the same fold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9509416306303644
# Refit a Random Forest on the full training split using the hyper-parameters
# selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: Mean Squared Error, its root, the R^2 score, and an
# "entropy" value.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *probability distributions* (it normalises its inputs); applying it to raw
# target/prediction vectors is not a standard regression metric and is
# undefined wherever y_pred is 0 but y_test is not — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003709674750284998 R2 Score: 0.9977373829294084 RMSE: 0.060907 Entropy Value: 0.0006342010482872406
# Rank the model inputs by Gini importance, highest first.
# NOTE(review): the inputs are PCA components that were merely *labelled* with
# the original feature names, so these importances describe components, not
# the raw variables.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.598261 |
| 0 | hospital_beds_per_thousand | 0.292702 |
| 2 | extreme_poverty | 0.046316 |
| 3 | gdp_per_capita | 0.031321 |
| 1 | human_development_index | 0.026994 |
| 4 | population_density | 0.004406 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Restrict the frame to the population-health-index columns and to the two
# countries under comparison.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2128 rows × 9 columns
# Build lagged copies of the mortality rate (previous day / week / month) so
# the time series becomes a tabular supervised-learning problem. A Random
# Forest has no notion of sequence order, so lagged target columns are the
# standard way to give it temporal context. shift() is applied per country
# (grouped by 'location') so one country's history never bleeds into
# another's; the leading rows with no history get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Principal Component Analysis to reduce multi-collinearity among the inputs.
#
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], a slice that
# still contained the target column 'Mortality Rate'. Feeding the target into
# the feature matrix leaks the answer into X and inflates the test scores.
# The target and the identifier columns are excluded here; the lagged
# mortality columns are kept, since lagged targets are legitimate predictors.
#
# NOTE(review): two further issues are left untouched to keep this change minimal:
#   * PCA is fitted on the full dataset *before* the train/test split, so test
#     rows still influence the learned components — fit on the training split only.
#   * PCA runs on unstandardised data, so large-scale columns dominate the
#     leading components; consider scaling before PCA.
feature_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 6 principal components — one per static input variable of the
# population health index (the feature matrix itself has 9 columns incl. lags).
n_components = 6
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, not the original variables;
# the original names are kept only for compatibility with the code that
# follows, and downstream "feature importances" refer to the components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardise to zero mean / unit variance; statistics come from the training
# split only and are reused on the test split (correct practice).
# NOTE(review): scaling is applied *after* PCA here; PCA on unstandardised
# inputs lets large-scale columns dominate the components — consider scaling
# before PCA instead.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV overrides it
# with each value from the grid below)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 uses an unshuffled KFold, so
# consecutive (time-ordered) rows land in the same fold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989025431448132
# Refit a Random Forest on the full training split using the hyper-parameters
# selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: Mean Squared Error, its root, the R^2 score, and an
# "entropy" value.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *probability distributions* (it normalises its inputs); applying it to raw
# target/prediction vectors is not a standard regression metric and is
# undefined wherever y_pred is 0 but y_test is not — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008557398987995634 R2 Score: 0.9951455562253472 RMSE: 0.092506 Entropy Value: 0.001751932122670549
# Rank the model inputs by Gini importance, highest first.
# NOTE(review): the inputs are PCA components that were merely *labelled* with
# the original feature names, so these importances describe components, not
# the raw variables.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.955989 |
| 2 | male_smokers | 0.028413 |
| 3 | life_expectancy | 0.006226 |
| 5 | median_age | 0.005439 |
| 0 | cardiovasc_death_rate | 0.003622 |
| 4 | aged_65_older | 0.000312 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Denmark'
country2 = 'Finland'
# Restrict the frame to the country-health-index columns and to the two
# countries under comparison.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
             'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5188 | Denmark | 2/3/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5189 | Denmark | 2/4/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5190 | Denmark | 2/5/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| 5191 | Denmark | 2/6/2020 | 2.50 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2128 rows × 9 columns
# Build lagged copies of the mortality rate (previous day / week / month) so
# the time series becomes a tabular supervised-learning problem. A Random
# Forest has no notion of sequence order, so lagged target columns are the
# standard way to give it temporal context. shift() is applied per country
# (grouped by 'location') so one country's history never bleeds into
# another's; the leading rows with no history get 0 instead of NaN.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Principal Component Analysis to reduce multi-collinearity among the inputs.
#
# FIX: the original code fitted PCA on df_updated.iloc[:, 2:], a slice that
# still contained the target column 'Mortality Rate'. Feeding the target into
# the feature matrix leaks the answer into X and inflates the test scores.
# The target and the identifier columns are excluded here; the lagged
# mortality columns are kept, since lagged targets are legitimate predictors.
#
# NOTE(review): two further issues are left untouched to keep this change minimal:
#   * PCA is fitted on the full dataset *before* the train/test split, so test
#     rows still influence the learned components — fit on the training split only.
#   * PCA runs on unstandardised data, so large-scale columns (e.g. population)
#     dominate the leading components; consider scaling before PCA.
feature_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(feature_matrix)
# Keep the first 6 principal components — one per static input variable of the
# country health index (the feature matrix itself has 9 columns incl. lags).
n_components = 6
principal_components = pca.transform(feature_matrix)[:, :n_components]
# CAUTION: these columns are principal components, not the original variables;
# the original names are kept only for compatibility with the code that
# follows, and downstream "feature importances" refer to the components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity with the original flow;
# the dummy columns are not used as model inputs below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Standardise to zero mean / unit variance; statistics come from the training
# split only and are reused on the test split (correct practice).
# NOTE(review): scaling is applied *after* PCA here; PCA on unstandardised
# inputs lets large-scale columns (e.g. population) dominate the components —
# consider scaling before PCA instead.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 here is only a placeholder — GridSearchCV overrides it
# with each value from the grid below)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; cv=10 uses an unshuffled KFold, so
# consecutive (time-ordered) rows land in the same fold.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989156833724155
# Refit a Random Forest on the full training split using the hyper-parameters
# selected by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: Mean Squared Error, its root, the R^2 score, and an
# "entropy" value.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *probability distributions* (it normalises its inputs); applying it to raw
# target/prediction vectors is not a standard regression metric and is
# undefined wherever y_pred is 0 but y_test is not — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008329520038950185 R2 Score: 0.995274827461516 RMSE: 0.091266 Entropy Value: 0.0016508456649447665
# Rank the model inputs by Gini importance, highest first.
# NOTE(review): the inputs are PCA components that were merely *labelled* with
# the original feature names, so these importances describe components, not
# the raw variables.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956574 |
| 2 | extreme_poverty | 0.028531 |
| 5 | population | 0.008398 |
| 3 | gdp_per_capita | 0.006081 |
| 4 | population_density | 0.000371 |
| 0 | hospital_beds_per_thousand | 0.000044 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Restrict the frame to the population-health-index columns and to the two
# countries under comparison.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2111 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features (1 day, 7 days, 30 days back) per country;
# the leading NaNs produced by shift() are replaced with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX (target leakage): the previous version fitted PCA on df_updated.iloc[:, 2:],
# which includes the 'Mortality Rate' target itself, so the extracted components carried
# the answer into the model (explaining the near-perfect R^2).  PCA is now fitted on the
# predictor columns only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components -- one per input variable of the downstream
# Random Forest Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the component columns are re-labelled with the original feature names for
# compatibility with the rest of the notebook, but each column is a principal component
# (a mixture of all inputs), NOT the named raw feature -- the later "feature importance"
# table should be read with that caveat.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs come from the PCA frame; the target stays in df_updated
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyperparameter search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10); fit() returns the search itself
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.999255499783543
# Refit a Random Forest with the best hyperparameters from the grid search above.
# Unpacking best_params_ directly avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the two vectors
# normalized as probability distributions -- it is not a regression error metric and
# returns inf whenever y_test contains zeros; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0057175049217638016 R2 Score: 0.9982582431748896 RMSE: 0.075614 Entropy Value: 0.00090492426720773
# Rank the model inputs by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.853151 |
| 0 | cardiovasc_death_rate | 0.096198 |
| 5 | median_age | 0.024493 |
| 2 | male_smokers | 0.024408 |
| 3 | life_expectancy | 0.001513 |
| 4 | aged_65_older | 0.000235 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Extracting important features for the Random Forest Model Analysis for the country health index.
# Column selection and country filter done in one .loc call (same result as selecting then filtering).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2111 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features (1 day, 7 days, 30 days back) per country;
# the leading NaNs produced by shift() are replaced with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX (target leakage): the previous version fitted PCA on df_updated.iloc[:, 2:],
# which includes the 'Mortality Rate' target itself, so the extracted components carried
# the answer into the model.  PCA is now fitted on the predictor columns only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components -- one per input variable of the downstream
# Random Forest Model Analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the component columns are re-labelled with the original feature names for
# compatibility with the rest of the notebook, but each column is a principal component
# (a mixture of all inputs), NOT the named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs come from the PCA frame; the target stays in df_updated
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyperparameter search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10); fit() returns the search itself
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9990269951754802
# Refit a Random Forest with the best hyperparameters from the grid search above.
# Unpacking best_params_ directly avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the two vectors
# normalized as probability distributions -- it is not a regression error metric and
# returns inf whenever y_test contains zeros; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007219604675773637 R2 Score: 0.9978006497780595 RMSE: 0.084968 Entropy Value: 0.0013001589537891883
# Rank the model inputs by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.940026 |
| 5 | population | 0.034436 |
| 2 | extreme_poverty | 0.023629 |
| 3 | gdp_per_capita | 0.001447 |
| 4 | population_density | 0.000316 |
| 0 | hospital_beds_per_thousand | 0.000146 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Portugal'
# Extracting important features for Random Forest Model Analysis for the population health index.
# Column selection and country filter done in one .loc call (same result as selecting then filtering).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features (1 day, 7 days, 30 days back) per country;
# the leading NaNs produced by shift() are replaced with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# BUG FIX (target leakage): the previous version fitted PCA on df_updated.iloc[:, 2:],
# which includes the 'Mortality Rate' target itself, so the extracted components carried
# the answer into the model.  PCA is now fitted on the predictor columns only.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 6 principal components -- one per input variable of the downstream
# Random Forest Model Analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the component columns are re-labelled with the original feature names for
# compatibility with the rest of the notebook, but each column is a principal component
# (a mixture of all inputs), NOT the named raw feature.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs come from the PCA frame; the target stays in df_updated
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated.loc[:, 'Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set for the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features: fit the scaler on the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyperparameter search below
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10); fit() returns the search itself
grid_search = GridSearchCV(rf, param_grid, cv=10).fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9954906642741526
# Refit a Random Forest with the best hyperparameters from the grid search above.
# Unpacking best_params_ directly avoids re-listing each key by hand.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between the two vectors
# normalized as probability distributions -- it is not a regression error metric and
# returns inf whenever y_test contains zeros (as observed for this country pair);
# interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.000776118438549149 R2 Score: 0.9992061430700794 RMSE: 0.027859 Entropy Value: inf
# Rank the model inputs by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.627077 |
| 0 | cardiovasc_death_rate | 0.278172 |
| 5 | median_age | 0.066527 |
| 2 | male_smokers | 0.024780 |
| 3 | life_expectancy | 0.002661 |
| 4 | aged_65_older | 0.000783 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Portugal'
# Extracting important features for the Random Forest Model Analysis for the country health index.
# Column selection and country filter done in one .loc call (same result as selecting then filtering).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 112.371 | 10270857 | 0.462977 |
2061 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features (1 day, 7 days, 30 days back) per country;
# the leading NaNs produced by shift() are replaced with 0.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these six columns are the first six *principal components*, not the
# original features — labeling them with the raw feature names is misleading, and
# the feature-importance table printed later inherits these mislabeled names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) — 81 candidates x 10 folds = 810 fits.
# Scoring defaults to the estimator's .score(), i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9959306509224998
# Fit random forest model with best hyperparameters from above.
# GridSearchCV refits the estimator with the best hyperparameters on the whole
# training set by default (refit=True), so reuse that model instead of manually
# re-instantiating and re-training an identical forest (same params, same
# random_state=42, same training data) — the duplicate fit is pure wasted work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalized vectors, not a regression metric; it returns inf whenever y_pred is
# zero where y_test is not (an inf value is visible in this notebook's output).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0011340569012311748 R2 Score: 0.9988400237834194 RMSE: 0.033676 Entropy Value: inf
# Rank the model inputs by the fitted forest's impurity-based importance,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.774123 |
| 5 | population | 0.191199 |
| 2 | extreme_poverty | 0.031279 |
| 3 | gdp_per_capita | 0.002127 |
| 4 | population_density | 0.001223 |
| 0 | hospital_beds_per_thousand | 0.000049 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Romania'
country2 = 'Serbia'
# Keep only the population-health features (plus identifiers and the target)
# for the Random Forest Model Analysis of the population health index,
# restricted to the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's series separate, so a shift never pulls values across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NOTE(review): this fabricates a zero mortality rate for each country's first 1/7/30 days instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains the 'Mortality Rate' target and the three lag columns created above
# (target leakage), and PCA is fit on the FULL dataset before the train/test split
# performed later (test-set leakage). It is also fit on unscaled features — one
# usually standardizes before PCA. TODO confirm intent.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these six columns are the first six *principal components*, not the
# original features — labeling them with the raw feature names is misleading, and
# the feature-importance table printed later inherits these mislabeled names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) — 81 candidates x 10 folds = 810 fits.
# Scoring defaults to the estimator's .score(), i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9966830298813072
# Fit random forest model with best hyperparameters from above.
# GridSearchCV refits the estimator with the best hyperparameters on the whole
# training set by default (refit=True), so reuse that model instead of manually
# re-instantiating and re-training an identical forest (same params, same
# random_state=42, same training data) — the duplicate fit is pure wasted work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalized vectors, not a regression metric; it returns inf whenever y_pred is
# zero where y_test is not (an inf value is visible in this notebook's output).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002114226351778038 R2 Score: 0.9987690796288725 RMSE: 0.045981 Entropy Value: 0.00023982543523329205
# Rank the model inputs by the fitted forest's impurity-based importance,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.735038 |
| 1 | female_smokers | 0.152939 |
| 5 | median_age | 0.086766 |
| 2 | male_smokers | 0.022275 |
| 3 | life_expectancy | 0.002292 |
| 4 | aged_65_older | 0.000690 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Romania'
country2 = 'Serbia'
# Keep only the country-health features (plus identifiers and the target)
# for the Random Forest Model Analysis of the country health index,
# restricted to the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's series separate, so a shift never pulls values across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NOTE(review): this fabricates a zero mortality rate for each country's first 1/7/30 days instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains the 'Mortality Rate' target and the three lag columns created above
# (target leakage), and PCA is fit on the FULL dataset before the train/test split
# performed later (test-set leakage). It is also fit on unscaled features, so
# large-magnitude columns (e.g. population) dominate the components — one usually
# standardizes before PCA. TODO confirm intent.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these six columns are the first six *principal components*, not the
# original features — labeling them with the raw feature names is misleading, and
# the feature-importance table printed later inherits these mislabeled names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) — 81 candidates x 10 folds = 810 fits.
# Scoring defaults to the estimator's .score(), i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9942151184753699
# Fit random forest model with best hyperparameters from above.
# GridSearchCV refits the estimator with the best hyperparameters on the whole
# training set by default (refit=True), so reuse that model instead of manually
# re-instantiating and re-training an identical forest (same params, same
# random_state=42, same training data) — the duplicate fit is pure wasted work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalized vectors, not a regression metric; it returns inf whenever y_pred is
# zero where y_test is not (an inf value is visible in this notebook's output).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003107698404524249 R2 Score: 0.9981906718406797 RMSE: 0.055747 Entropy Value: 0.0004506715934095217
# Rank the model inputs by the fitted forest's impurity-based importance,
# most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.753233 |
| 1 | human_development_index | 0.216832 |
| 2 | extreme_poverty | 0.021982 |
| 3 | gdp_per_capita | 0.007136 |
| 4 | population_density | 0.000814 |
| 0 | hospital_beds_per_thousand | 0.000003 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep only the population-health features (plus identifiers and the target)
# for the Random Forest Model Analysis of the population health index,
# restricted to the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
                          'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') keeps each country's series separate, so a shift never pulls values across country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# NOTE(review): this fabricates a zero mortality rate for each country's first 1/7/30 days instead of dropping those rows — confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input still
# contains the 'Mortality Rate' target and the three lag columns created above
# (target leakage), and PCA is fit on the FULL dataset before the train/test split
# performed later (test-set leakage). It is also fit on unscaled features — one
# usually standardizes before PCA. TODO confirm intent.
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these six columns are the first six *principal components*, not the
# original features — labeling them with the raw feature names is misleading, and
# the feature-importance table printed later inherits these mislabeled names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10) — 81 candidates x 10 folds = 810 fits.
# Scoring defaults to the estimator's .score(), i.e. R^2 for a regressor.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974452198310478
# Fit random forest model with best hyperparameters from above.
# GridSearchCV refits the estimator with the best hyperparameters on the whole
# training set by default (refit=True), so reuse that model instead of manually
# re-instantiating and re-training an identical forest (same params, same
# random_state=42, same training data) — the duplicate fit is pure wasted work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalized vectors, not a regression metric; it returns inf whenever y_pred is
# zero where y_test is not (an inf value is visible in this notebook's output).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005940423882190219 R2 Score: 0.9970855058116366 RMSE: 0.077074 Entropy Value: 0.0007644065172632581
# Tabulate the tuned model's impurity-based importances, largest first.
# NOTE(review): these columns are principal components that were merely
# labelled with the original feature names, so the importances describe the
# PCs, not the raw features -- confirm interpretation before reporting.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.641126 |
| 0 | cardiovasc_death_rate | 0.314675 |
| 2 | male_smokers | 0.022687 |
| 5 | median_age | 0.017803 |
| 3 | life_expectancy | 0.003272 |
| 4 | aged_65_older | 0.000437 |
# Load the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis (matched on median age in an earlier step).
country1 = 'Slovakia'
country2 = 'Slovenia'
# Keep only the country-health-index features plus identifiers and target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes this an independent frame so
# the lagged-mortality columns added later assign cleanly instead of raising
# pandas' SettingWithCopyWarning on a view of the original dataframe.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2091 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day, 7 days, 30 days back) within each
# country; leading NaNs (no history yet) are replaced with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods).fillna(0)
# PCA over every numeric column (index 2 onward) to address multicollinearity.
# NOTE(review): PCA is fitted on all rows before the later train/test split,
# which leaks test-set statistics into the transform -- confirm acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- the same count as the model's inputs for
# the country-health-index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): the columns are principal components merely labelled with the
# original feature names -- they are not those raw features.
pc_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, then learn scaling statistics from the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9967388889181926
# Rebuild the Random Forest with the tuned hyperparameters.  Unpacking
# grid_search.best_params_ keeps this in sync with param_grid automatically
# instead of copying each of the four keys by hand (fragile: a new grid key
# would silently be dropped).  The resulting model is identical.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2 score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and is undefined (inf) wherever y_pred is 0 but
# y_test is not.  Kept for output compatibility -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010420033179699867 R2 Score: 0.9948877173166316 RMSE: 0.102079 Entropy Value: 0.0011192503090849918
# Tabulate the tuned model's impurity-based importances, largest first.
# NOTE(review): these columns are principal components that were merely
# labelled with the original feature names, so the importances describe the
# PCs, not the raw features -- confirm interpretation before reporting.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.891315 |
| 5 | population | 0.082122 |
| 2 | extreme_poverty | 0.022988 |
| 3 | gdp_per_capita | 0.002865 |
| 4 | population_density | 0.000641 |
| 0 | hospital_beds_per_thousand | 0.000069 |
# Reload the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis (matched on median age in an earlier step).
country1 = 'Spain'
country2 = 'United States'
# Keep only the population-health-index features plus identifiers and target.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes this an independent frame so
# the lagged-mortality columns added later assign cleanly instead of raising
# pandas' SettingWithCopyWarning on a view of the original dataframe.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day, 7 days, 30 days back) within each
# country; leading NaNs (no history yet) are replaced with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods).fillna(0)
# PCA over every numeric column (index 2 onward) to address multicollinearity.
# NOTE(review): PCA is fitted on all rows before the later train/test split,
# which leaks test-set statistics into the transform -- confirm acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- the same count as the model's inputs for
# the population-health-index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): the columns are principal components merely labelled with the
# original feature names -- they are not those raw features.
pc_labels = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, then learn scaling statistics from the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9979863431205297
# Rebuild the Random Forest with the tuned hyperparameters.  Unpacking
# grid_search.best_params_ keeps this in sync with param_grid automatically
# instead of copying each of the four keys by hand (fragile: a new grid key
# would silently be dropped).  The resulting model is identical.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2 score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and is undefined (inf) wherever y_pred is 0 but
# y_test is not.  Kept for output compatibility -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.07209044944386439 R2 Score: 0.9878655952342997 RMSE: 0.268497 Entropy Value: 0.002601145529676064
# Tabulate the tuned model's impurity-based importances, largest first.
# NOTE(review): these columns are principal components that were merely
# labelled with the original feature names, so the importances describe the
# PCs, not the raw features -- confirm interpretation before reporting.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | female_smokers | 0.915632 |
| 0 | cardiovasc_death_rate | 0.053216 |
| 2 | male_smokers | 0.026576 |
| 3 | life_expectancy | 0.002459 |
| 5 | median_age | 0.001546 |
| 4 | aged_65_older | 0.000572 |
# Reload the dataframe holding the first country of each country pairing
# produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis (matched on median age in an earlier step).
country1 = 'Spain'
country2 = 'United States'
# Keep only the country-health-index features plus identifiers and target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes this an independent frame so
# the lagged-mortality columns added later assign cleanly instead of raising
# pandas' SettingWithCopyWarning on a view of the original dataframe.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day, 7 days, 30 days back) within each
# country; leading NaNs (no history yet) are replaced with 0.
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lag_periods.items():
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(periods).fillna(0)
# PCA over every numeric column (index 2 onward) to address multicollinearity.
# NOTE(review): PCA is fitted on all rows before the later train/test split,
# which leaks test-set statistics into the transform -- confirm acceptable.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep six principal components -- the same count as the model's inputs for
# the country-health-index analysis.
n_components = 6
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): the columns are principal components merely labelled with the
# original feature names -- they are not those raw features.
pc_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=pc_labels)
principal_df['location'] = df_updated['location'].to_numpy()
# One-hot encode the country label.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = pc_labels
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, then learn scaling statistics from the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest; the search below tunes its main hyperparameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9974172303447238
# Rebuild the Random Forest with the tuned hyperparameters.  Unpacking
# grid_search.best_params_ keeps this in sync with param_grid automatically
# instead of copying each of the four keys by hand (fragile: a new grid key
# would silently be dropped).  The resulting model is identical.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2 score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and is undefined (inf) wherever y_pred is 0 but
# y_test is not.  Kept for output compatibility -- confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.069094695635595 R2 Score: 0.9883698463461787 RMSE: 0.262859 Entropy Value: 0.002484362767590676
# Tabulate the tuned model's impurity-based importances, largest first.
# NOTE(review): these columns are principal components that were merely
# labelled with the original feature names, so the importances describe the
# PCs, not the raw features -- confirm interpretation before reporting.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.954547 |
| 2 | extreme_poverty | 0.026867 |
| 5 | population | 0.012769 |
| 3 | gdp_per_capita | 0.003674 |
| 0 | hospital_beds_per_thousand | 0.001522 |
| 4 | population_density | 0.000621 |
# Country Pair by Pair Analysis relative to median age.
# Reload the cleaned and preprocessed Our World in Data COVID-19 dataset.
csv_path = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(csv_path)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Per-country subsets used to form the pairings of countries based on median age (13 pairs).
df_Bulgaria = df.loc[df['location'].eq("Bulgaria")]
df_Italy = df.loc[df['location'].eq("Italy")]
df_Portugal = df.loc[df['location'].eq("Portugal")]
df_Slovenia = df.loc[df['location'].eq("Slovenia")]
df_Spain = df.loc[df['location'].eq("Spain")]
df_Austria = df.loc[df['location'].eq("Austria")]
df_Belgium = df.loc[df['location'].eq("Belgium")]
df_Canada = df.loc[df['location'].eq("Canada")]
df_Czechia = df.loc[df['location'].eq("Czechia")]
df_Denmark = df.loc[df['location'].eq("Denmark")]
df_Estonia = df.loc[df['location'].eq("Estonia")]
df_Finland = df.loc[df['location'].eq("Finland")]
df_France = df.loc[df['location'].eq("France")]
df_Latvia = df.loc[df['location'].eq("Latvia")]
df_Netherlands = df.loc[df['location'].eq("Netherlands")]
df_Romania = df.loc[df['location'].eq("Romania")]
df_Serbia = df.loc[df['location'].eq("Serbia")]
df_Slovakia = df.loc[df['location'].eq("Slovakia")]
df_Sweden = df.loc[df['location'].eq("Sweden")]
df_Switzerland = df.loc[df['location'].eq("Switzerland")]
df_Cyprus = df.loc[df['location'].eq("Cyprus")]
df_Iceland = df.loc[df['location'].eq("Iceland")]
df_Ireland = df.loc[df['location'].eq("Ireland")]
df_Luxembourg = df.loc[df['location'].eq("Luxembourg")]
df_UnitedKingdom = df.loc[df['location'].eq("United Kingdom")]
df_UnitedStates = df.loc[df['location'].eq("United States")]
# Drop the first two UK rows (presumably to align its date range with the other
# countries -- TODO confirm); .iloc[2:] is equivalent to the original .tail(-2).
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Combine the first country from each defined pair above into a single dataframe.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia,
    df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg,
    df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden,
    df_Spain, df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)
# Persist the combined dataframe for the next stage of the analysis.
dataframe_one.to_csv("dataframe-one.csv")
# Re-import the combined dataframe (first country of each pairing) built in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries under comparison for this run.
country1 = 'Bulgaria'
country2 = 'Italy'
# Population-health features retained for the Random Forest Model Analysis
# for the population health index.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 0.735109 |
2091 rows × 9 columns
# Random Forests need tabular supervised data, so the mortality time series is converted
# to a supervised-learning problem by adding lagged copies of the target: the previous
# day's, previous week's, and previous month's mortality rate (via pandas shift()).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in {'prev_day_mortality': 1,
                         'prev_week_mortality': 7,
                         'prev_month_mortality': 30}.items():
    # shift() within each country keeps lags from crossing country boundaries;
    # the first `periods` rows of each country have no history and are filled with 0.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the six
# population-health predictors.
# FIX(review): the original fit/transformed PCA on df_updated.iloc[:, 2:], which at this
# point also contains 'Mortality Rate' and its three lag columns -- i.e. the prediction
# target leaked into the model inputs. PCA is now fit on the predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep as many components as there are input variables (6) for the Random Forest Model Analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# The resulting columns are PCA scores, not the original variables, so they are labelled
# PC1..PC6 instead of reusing the raw feature names (the original labelling was misleading).
selected_cols = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs using statistics learned from the training set only,
# so no information from the test set leaks into the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9897461070578887
# Refit a Random Forest on the training split using the best hyperparameters found above.
# Unpacking best_params_ keeps this in sync with param_grid automatically (the original
# copied each of the four entries out by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate itself
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# series after normalising each to a probability distribution -- it is not a standard
# regression metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014757882118008693 R2 Score: 0.9986128106037364 RMSE: 0.121482 Entropy Value: 0.000395927702763076
# Rank the model inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.529929 |
| 5 | aged_65_older | 0.436215 |
| 2 | female_smokers | 0.025303 |
| 0 | cardiovasc_death_rate | 0.004177 |
| 3 | male_smokers | 0.002378 |
| 4 | life_expectancy | 0.001999 |
# Re-import the combined dataframe (first country of each pairing) built in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries under comparison for this run.
country1 = 'Bulgaria'
country2 = 'Italy'
# Country-health (infrastructure and socio-economic) features retained for the
# Random Forest Model Analysis for the country health index.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.180 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2091 rows × 9 columns
# Random Forests need tabular supervised data, so the mortality time series is converted
# to a supervised-learning problem by adding lagged copies of the target: the previous
# day's, previous week's, and previous month's mortality rate (via pandas shift()).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in {'prev_day_mortality': 1,
                         'prev_week_mortality': 7,
                         'prev_month_mortality': 30}.items():
    # shift() within each country keeps lags from crossing country boundaries;
    # the first `periods` rows of each country have no history and are filled with 0.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the six
# country-health predictors.
# FIX(review): the original fit/transformed PCA on df_updated.iloc[:, 2:], which at this
# point also contains 'Mortality Rate' and its three lag columns -- i.e. the prediction
# target leaked into the model inputs. PCA is now fit on the predictor columns only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep as many components as there are input variables (6) for the Random Forest Model Analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# The resulting columns are PCA scores, not the original variables, so they are labelled
# PC1..PC6 instead of reusing the raw feature names (the original labelling was misleading).
selected_cols = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs using statistics learned from the training set only,
# so no information from the test set leaks into the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9908303413224842
# Refit a Random Forest on the training split using the best hyperparameters found above.
# Unpacking best_params_ keeps this in sync with param_grid automatically (the original
# copied each of the four entries out by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate itself
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# series after normalising each to a probability distribution -- it is not a standard
# regression metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.02496651758267187 R2 Score: 0.9976532345105231 RMSE: 0.158008 Entropy Value: 0.0005803326097664843
# Rank the model inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.873019 |
| 5 | population | 0.088615 |
| 2 | extreme_poverty | 0.032714 |
| 4 | population_density | 0.003001 |
| 3 | gdp_per_capita | 0.002228 |
| 0 | hospital_beds_per_thousand | 0.000424 |
# Re-import the combined dataframe (first country of each pairing) built in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries under comparison for this run.
country1 = 'Portugal'
country2 = 'Slovenia'
# Population-health features retained for the Random Forest Model Analysis
# for the population health index.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 0.536669 |
2096 rows × 9 columns
# Random Forests need tabular supervised data, so the mortality time series is converted
# to a supervised-learning problem by adding lagged copies of the target: the previous
# day's, previous week's, and previous month's mortality rate (via pandas shift()).
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in {'prev_day_mortality': 1,
                         'prev_week_mortality': 7,
                         'prev_month_mortality': 30}.items():
    # shift() within each country keeps lags from crossing country boundaries;
    # the first `periods` rows of each country have no history and are filled with 0.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the six
# population-health predictors.
# FIX(review): the original fit/transformed PCA on df_updated.iloc[:, 2:], which at this
# point also contains 'Mortality Rate' and its three lag columns -- i.e. the prediction
# target leaked into the model inputs. PCA is now fit on the predictor columns only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep as many components as there are input variables (6) for the Random Forest Model Analysis.
n_components = 6
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# The resulting columns are PCA scores, not the original variables, so they are labelled
# PC1..PC6 instead of reusing the raw feature names (the original labelling was misleading).
selected_cols = ['PC%d' % i for i in range(1, n_components + 1)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise inputs using statistics learned from the training set only,
# so no information from the test set leaks into the scaling.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base Random Forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983721604192913
# Refit a Random Forest on the training split using the best hyperparameters found above.
# Unpacking best_params_ keeps this in sync with param_grid automatically (the original
# copied each of the four entries out by hand).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality rate itself
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# series after normalising each to a probability distribution -- it is not a standard
# regression metric; confirm this interpretation is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0030370867428418185 R2 Score: 0.998479699547482 RMSE: 0.055110 Entropy Value: 0.0004092010063048637
# Rank the model inputs by their importance in the fitted Random Forest.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.936569 |
| 0 | cardiovasc_death_rate | 0.030037 |
| 2 | female_smokers | 0.027445 |
| 5 | aged_65_older | 0.004581 |
| 3 | male_smokers | 0.001084 |
| 4 | life_expectancy | 0.000284 |
# Re-import the combined dataframe (first country of each pairing) built in the previous step.
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovenia'
# Restrict the frame to the socioeconomic / health-system features used by the
# country-health-index Random Forest analysis, and to the two countries above.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') shifts each country's series independently, so no value
# leaks across the country boundary.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded earlier,
# so these assignments may trigger pandas' SettingWithCopyWarning; adding
# .copy() after the filtering step would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier observation to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'location'/'date' to the
# PCA fit, which includes the target ('Mortality Rate') and the three lag
# columns just created — the target therefore leaks into the components that
# are later used as model inputs. PCA is also fit on unscaled data here
# (standardisation happens only after the train/test split). Confirm both
# choices are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the (already-fitted) PCA onto the data and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# variables — reusing the original feature names here is misleading, and the
# downstream "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never included in X below (X is built
# from principal_df), so this encoding has no effect on the model — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set
# must be transformed with training-set statistics to avoid leakage)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor; n_estimators here is only a
# placeholder because the grid below overrides it for every candidate
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 fans the 81 x 10 fits out over all CPU cores; the scores are
# unchanged because every forest is seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998167144839121
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ is equivalent to listing each key explicitly, since
# the grid covers exactly these four parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and returns their KL divergence; mortality rates are not
# distributions, so this is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038490153568325174 R2 Score: 0.9980732655059875 RMSE: 0.062040 Entropy Value: 0.000510878761187502
# Rank the model inputs (principal components labelled with the original
# feature names) by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.964288 |
| 2 | extreme_poverty | 0.029185 |
| 5 | population | 0.004776 |
| 3 | gdp_per_capita | 0.001256 |
| 4 | population_density | 0.000316 |
| 0 | hospital_beds_per_thousand | 0.000179 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Austria'
# Restrict the frame to the population-health features used by the
# population-health-index Random Forest analysis, and to the two countries above.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') shifts each country's series independently, so no value
# leaks across the country boundary.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded earlier,
# so these assignments may trigger pandas' SettingWithCopyWarning; adding
# .copy() after the filtering step would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier observation to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'location'/'date' to the
# PCA fit, which includes the target ('Mortality Rate') and the three lag
# columns just created — the target therefore leaks into the components that
# are later used as model inputs. PCA is also fit on unscaled data here
# (standardisation happens only after the train/test split). Confirm both
# choices are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project the (already-fitted) PCA onto the data and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# variables — reusing the original feature names here is misleading, and the
# downstream "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never included in X below (X is built
# from principal_df), so this encoding has no effect on the model — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set
# must be transformed with training-set statistics to avoid leakage)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor; n_estimators here is only a
# placeholder because the grid below overrides it for every candidate
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 fans the 81 x 10 fits out over all CPU cores; the scores are
# unchanged because every forest is seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9985662954702006
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ is equivalent to listing each key explicitly, since
# the grid covers exactly these four parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and returns their KL divergence; mortality rates are not
# distributions, so this is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004783382509163117 R2 Score: 0.9991495580176508 RMSE: 0.069162 Entropy Value: 0.00032436366709763954
# Rank the model inputs (principal components labelled with the original
# feature names) by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.505889 |
| 1 | diabetes_prevalence | 0.432103 |
| 2 | female_smokers | 0.042053 |
| 5 | aged_65_older | 0.016378 |
| 3 | male_smokers | 0.003240 |
| 4 | life_expectancy | 0.000337 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Austria'
# Restrict the frame to the socioeconomic / health-system features used by the
# country-health-index Random Forest analysis, and to the two countries above.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.97 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') shifts each country's series independently, so no value
# leaks across the country boundary.
# NOTE(review): df_updated is a row-filtered slice of the frame loaded earlier,
# so these assignments may trigger pandas' SettingWithCopyWarning; adding
# .copy() after the filtering step would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier observation to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] passes every column after 'location'/'date' to the
# PCA fit, which includes the target ('Mortality Rate') and the three lag
# columns just created — the target therefore leaks into the components that
# are later used as model inputs. PCA is also fit on unscaled data here
# (standardisation happens only after the train/test split). Confirm both
# choices are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project the (already-fitted) PCA onto the data and keep the first 6 components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# variables — reusing the original feature names here is misleading, and the
# downstream "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location dummies are never included in X below (X is built
# from principal_df), so this encoding has no effect on the model — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so test-set statistics never leak into the scaler
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaling fitted on the training set to both splits (the test set
# must be transformed with training-set statistics to avoid leakage)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the base RandomForestRegressor; n_estimators here is only a
# placeholder because the grid below overrides it for every candidate
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 candidate combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 fans the 81 x 10 fits out over all CPU cores; the scores are
# unchanged because every forest is seeded with random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975828020530777
# Refit a Random Forest with the best hyperparameters from the grid search.
# Unpacking best_params_ is equivalent to listing each key explicitly, since
# the grid covers exactly these four parameters.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and returns their KL divergence; mortality rates are not
# distributions, so this is not a standard regression metric — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005858624459701179 R2 Score: 0.9989583897608432 RMSE: 0.076542 Entropy Value: 0.00046284689785761193
# Rank the model inputs (principal components labelled with the original
# feature names) by their Random Forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.926755 |
| 2 | extreme_poverty | 0.038959 |
| 5 | population | 0.029970 |
| 3 | gdp_per_capita | 0.003855 |
| 4 | population_density | 0.000348 |
| 0 | hospital_beds_per_thousand | 0.000113 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Canada'
# Restrict the frame to the population-health features used by the
# population-health-index Random Forest analysis, and to the two countries above.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country so
# the time series becomes a supervised-learning table for the Random Forest.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history yet (start of each country's series) get 0, not NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on everything past the two identifier columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags, so the target
# itself enters the PCA input — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): iloc[:,2:] also contains 'Mortality Rate' and its three lag columns,
# so the components below encode the prediction target itself — this likely inflates
# the downstream R^2; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components merely *labeled*
# with the original feature names — the later feature-importance table therefore
# ranks components, not the named health variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: first six principal components; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series rows lets training see
# future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
base_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter space explored by the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.997355164620372
# Refit a Random Forest on the full training set using the best hyperparameters
# from the grid search. Dict-unpacking passes every tuned parameter at once
# instead of copying each key by hand (which risks silently dropping one).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 score, and the "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) with two arguments computes the KL
# divergence between the normalized distributions, not the entropy of the
# predictions — confirm this is the metric intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.030061431635492165 R2 Score: 0.997726838354729 RMSE: 0.173382 Entropy Value: 0.0006262375647953363
# Rank the model inputs by Gini importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.905479 |
| 0 | cardiovasc_death_rate | 0.059714 |
| 2 | female_smokers | 0.024466 |
| 5 | aged_65_older | 0.007487 |
| 3 | male_smokers | 0.002440 |
| 4 | life_expectancy | 0.000414 |
# Reload the dataframe holding the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Canada'
# Keep only the identifier columns plus the country-health (infrastructure/economic)
# features and the mortality target, restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 5.64 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.50 | 0.929 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2132 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country so
# the time series becomes a supervised-learning table for the Random Forest.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history yet (start of each country's series) get 0, not NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on everything past the two identifier columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags, so the target
# itself enters the PCA input — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): iloc[:,2:] also contains 'Mortality Rate' and its three lag columns,
# so the components below encode the prediction target itself — this likely inflates
# the downstream R^2; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components merely *labeled*
# with the original feature names — the later feature-importance table therefore
# ranks components, not the named country-health variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series rows lets training see
# future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
base_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter space explored by the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975469171045417
# Refit a Random Forest on the full training set using the best hyperparameters
# from the grid search. Dict-unpacking passes every tuned parameter at once
# instead of copying each key by hand (which risks silently dropping one).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 score, and the "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) with two arguments computes the KL
# divergence between the normalized distributions, not the entropy of the
# predictions — confirm this is the metric intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03171734845754228 R2 Score: 0.9976016225415473 RMSE: 0.178094 Entropy Value: 0.0007572895754231443
# Rank the model inputs by Gini importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.965414 |
| 2 | extreme_poverty | 0.021985 |
| 5 | population | 0.007566 |
| 0 | hospital_beds_per_thousand | 0.002465 |
| 3 | gdp_per_capita | 0.001925 |
| 4 | population_density | 0.000644 |
# Reload the dataframe holding the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Denmark'
# Keep only the identifier columns plus the population-health features and the
# mortality target, restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country so
# the time series becomes a supervised-learning table for the Random Forest.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history yet (start of each country's series) get 0, not NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on everything past the two identifier columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags, so the target
# itself enters the PCA input — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# NOTE(review): iloc[:,2:] also contains 'Mortality Rate' and its three lag columns,
# so the components below encode the prediction target itself — this likely inflates
# the downstream R^2; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components merely *labeled*
# with the original feature names — the later feature-importance table therefore
# ranks components, not the named health variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: first six principal components; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series rows lets training see
# future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
base_rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter space explored by the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=base_rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9987729138287603
# Refit a Random Forest on the full training set using the best hyperparameters
# from the grid search. Dict-unpacking passes every tuned parameter at once
# instead of copying each key by hand (which risks silently dropping one).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 score, and the "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) with two arguments computes the KL
# divergence between the normalized distributions, not the entropy of the
# predictions — confirm this is the metric intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003088227358667578 R2 Score: 0.9974360743157676 RMSE: 0.055572 Entropy Value: 0.0007456443001105355
# Rank the model inputs by Gini importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.959778 |
| 2 | female_smokers | 0.023462 |
| 0 | cardiovasc_death_rate | 0.012141 |
| 5 | aged_65_older | 0.002635 |
| 3 | male_smokers | 0.001706 |
| 4 | life_expectancy | 0.000277 |
# Reload the dataframe holding the first country of each pairing from the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'Denmark'
# Keep only the identifier columns plus the country-health (infrastructure/economic)
# features and the mortality target, restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2096 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country so
# the time series becomes a supervised-learning table for the Random Forest.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history yet (start of each country's series) get 0, not NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Fit PCA on everything past the two identifier columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags, so the target
# itself enters the PCA input — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# NOTE(review): iloc[:,2:] also contains 'Mortality Rate' and its three lag columns,
# so the components below encode the prediction target itself — this likely inflates
# the downstream R^2; confirm intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six principal components merely *labeled*
# with the original feature names — the later feature-importance table therefore
# ranks components, not the named country-health variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never added to X below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components; y: the mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a shuffled random split on daily time-series rows lets training see
# future days — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 model fits.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's own score, i.e. R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984672041343956
# Refit a forest with the tuned hyper-parameters. Unpacking best_params_
# avoids re-listing each tuned key by hand (grid_search.best_estimator_ would
# yield the same fitted model, since refit=True is GridSearchCV's default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, plus an "entropy" value.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it normalises them), so this is a KL-divergence between the
# normalised y_test and y_pred rather than a standard regression metric -
# confirm it is the intended quantity.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0024692245202902934 R2 Score: 0.9979499863732701 RMSE: 0.049691 Entropy Value: 0.0005994641511856433
# Rank the six model inputs by impurity-based importance, most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.967631 |
| 2 | extreme_poverty | 0.025845 |
| 5 | population | 0.003256 |
| 3 | gdp_per_capita | 0.001799 |
| 0 | hospital_beds_per_thousand | 0.001150 |
| 4 | population_density | 0.000319 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the loaded frame for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index model.
country1 = 'Estonia'
country2 = 'Finland'
# Restrict to the population-health columns and to the two countries in a
# single .loc call (column-then-row selection collapsed into one step).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 0.55159 |
2127 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature, because a Random Forest is an ensemble learning algorithm designed for non-sequential data.
Converting the timeseries dataset into a supervised learning problem therefore allows the Random Forest Model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (t-1 day, t-7 days, t-30 days), computed per
# country so one country's history never bleeds into another's. Shift and
# zero-fill are chained in a single expression per column; leading rows with
# no history default to 0.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1).fillna(0)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7).fillna(0)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] feeds every column after 'location'/'date' into
# PCA, which includes 'Mortality Rate' itself and the three lagged-mortality
# columns just created - i.e. the prediction target leaks into the PCA inputs.
# Confirm this is intentional.
# NOTE(review): PCA is fitted on raw, unscaled values (StandardScaler is only
# applied later, after the train/test split), so large-magnitude columns will
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first six principal components of the already-fitted PCA.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL PCA input columns), not the original variables - labelling PC1..PC6 with
# the raw feature names is misleading, and downstream "feature importances"
# really describe the components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this cell sequence df_updated is only used to supply y
# below, so the dummy columns are never consumed by the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: the six (renamed) principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training fold only (no test-set leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 model fits.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's own score, i.e. R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9960406584730894
# Refit a forest with the tuned hyper-parameters. Unpacking best_params_
# avoids re-listing each tuned key by hand (grid_search.best_estimator_ would
# yield the same fitted model, since refit=True is GridSearchCV's default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, plus an "entropy" value.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it normalises them), so this is a KL-divergence between the
# normalised y_test and y_pred rather than a standard regression metric -
# confirm it is the intended quantity.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003447956787565997 R2 Score: 0.9969249632053334 RMSE: 0.058719 Entropy Value: 0.0011003619011085734
# Rank the six model inputs by impurity-based importance, most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.916769 |
| 0 | cardiovasc_death_rate | 0.046472 |
| 2 | female_smokers | 0.023264 |
| 5 | aged_65_older | 0.010802 |
| 3 | male_smokers | 0.001888 |
| 4 | life_expectancy | 0.000803 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the loaded frame for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country-health-index model.
country1 = 'Estonia'
country2 = 'Finland'
# Restrict to the country-health columns and to the two countries in a
# single .loc call (column-then-row selection collapsed into one step).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.55159 |
2127 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature, because a Random Forest is an ensemble learning algorithm designed for non-sequential data.
Converting the timeseries dataset into a supervised learning problem therefore allows the Random Forest Model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (t-1 day, t-7 days, t-30 days), computed per
# country so one country's history never bleeds into another's. Shift and
# zero-fill are chained in a single expression per column; leading rows with
# no history default to 0.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1).fillna(0)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7).fillna(0)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] feeds every column after 'location'/'date' into
# PCA, which includes 'Mortality Rate' itself and the three lagged-mortality
# columns just created - i.e. the prediction target leaks into the PCA inputs.
# Confirm this is intentional.
# NOTE(review): PCA is fitted on raw, unscaled values (StandardScaler is only
# applied later, after the train/test split), so large-magnitude columns such
# as population will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Keep only the first six principal components of the already-fitted PCA.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL PCA input columns), not the original variables - labelling PC1..PC6 with
# the raw feature names is misleading, and downstream "feature importances"
# really describe the components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this cell sequence df_updated is only used to supply y
# below, so the dummy columns are never consumed by the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six (renamed) principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training fold only (no test-set leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 model fits.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's own score, i.e. R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9961020915988987
# Refit a forest with the tuned hyper-parameters. Unpacking best_params_
# avoids re-listing each tuned key by hand (grid_search.best_estimator_ would
# yield the same fitted model, since refit=True is GridSearchCV's default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, plus an "entropy" value.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it normalises them), so this is a KL-divergence between the
# normalised y_test and y_pred rather than a standard regression metric -
# confirm it is the intended quantity.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002827297964652223 R2 Score: 0.9974784935524297 RMSE: 0.053172 Entropy Value: 0.0008687386647983583
# Rank the six model inputs by impurity-based importance, most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.955245 |
| 2 | extreme_poverty | 0.021036 |
| 5 | population | 0.017558 |
| 3 | gdp_per_capita | 0.003311 |
| 0 | hospital_beds_per_thousand | 0.002044 |
| 4 | population_density | 0.000806 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the loaded frame for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index model.
country1 = 'France'
country2 = 'Latvia'
# Restrict to the population-health columns and to the two countries in a
# single .loc call (column-then-row selection collapsed into one step).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.06 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.06 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 0.631969 |
2109 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature, because a Random Forest is an ensemble learning algorithm designed for non-sequential data.
Converting the timeseries dataset into a supervised learning problem therefore allows the Random Forest Model to be applied directly to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (t-1 day, t-7 days, t-30 days), computed per
# country so one country's history never bleeds into another's. Shift and
# zero-fill are chained in a single expression per column; leading rows with
# no history default to 0.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1).fillna(0)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7).fillna(0)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] feeds every column after 'location'/'date' into
# PCA, which includes 'Mortality Rate' itself and the three lagged-mortality
# columns just created - i.e. the prediction target leaks into the PCA inputs.
# Confirm this is intentional.
# NOTE(review): PCA is fitted on raw, unscaled values (StandardScaler is only
# applied later, after the train/test split), so large-magnitude columns will
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Keep only the first six principal components of the already-fitted PCA.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL PCA input columns), not the original variables - labelling PC1..PC6 with
# the raw feature names is misleading, and downstream "feature importances"
# really describe the components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): within this cell sequence df_updated is only used to supply y
# below, so the dummy columns are never consumed by the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: the six (renamed) principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# Scaler statistics come from the training fold only (no test-set leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder; the grid search below tunes it.)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate combinations; with cv=10 that is 810 model fits.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's own score, i.e. R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9898387697173525
# Refit a forest with the tuned hyper-parameters. Unpacking best_params_
# avoids re-listing each tuned key by hand (grid_search.best_estimator_ would
# yield the same fitted model, since refit=True is GridSearchCV's default).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out evaluation: MSE, RMSE, R^2, plus an "entropy" value.
# NOTE(review): scipy.stats.entropy treats its arguments as probability
# distributions (it normalises them), so this is a KL-divergence between the
# normalised y_test and y_pred rather than a standard regression metric -
# confirm it is the intended quantity.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.12061782814064277 R2 Score: 0.9894304331066966 RMSE: 0.347301 Entropy Value: 0.002083591439709351
# Rank the six model inputs by impurity-based importance, most important first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.576558 |
| 1 | diabetes_prevalence | 0.393405 |
| 2 | female_smokers | 0.019743 |
| 3 | male_smokers | 0.006207 |
| 5 | aged_65_older | 0.002619 |
| 4 | life_expectancy | 0.001468 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path - this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression so the notebook renders the loaded frame for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'France'
country2 = 'Latvia'
# Keep only the columns used by the country-health-index Random Forest
# analysis, restricted to the rows for the selected country pair.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 0.70 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2109 rows × 9 columns
# Random Forests expect tabular, non-sequential inputs, so the COVID-19
# time series is recast as a supervised-learning problem: lagged copies of
# the mortality rate (previous day / week / month) become ordinary feature
# columns, one observation per row.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so lags never cross a country boundary in
    # the stacked frame, then zero-fill the leading rows with no history.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# and its lagged copies (prev_day/week/month) alongside the predictors, so
# the fitted components encode the target itself (target leakage) — confirm
# whether PCA was meant to be fit on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project onto the first 6 components of the matrix fitted above (which,
# per the note at the PCA fit, includes the target and its lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all inputs), not the original variables — labelling them with the raw
# feature names makes the downstream feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location' dummies and principal_df['location']
# are created here but never enter X or y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then fit the scaler on the
# training portion only (avoids leaking test-set statistics into scaling).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9912037531723439
# Refit a Random Forest on the scaled training data using the tuned
# hyperparameters found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predictions on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Model diagnostics on the test predictions: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — not a standard
# regression error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.13274985547442472 R2 Score: 0.9883673209910792 RMSE: 0.364349 Entropy Value: 0.0023789841094078866
# Rank the model's input columns by Random Forest importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.949555 |
| 2 | extreme_poverty | 0.021708 |
| 5 | population | 0.010848 |
| 3 | gdp_per_capita | 0.009784 |
| 0 | hospital_beds_per_thousand | 0.006611 |
| 4 | population_density | 0.001495 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Netherlands'
country2 = 'Romania'
# Keep only the columns used by the population-health-index Random Forest
# analysis, restricted to the rows for the selected country pair.
index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 2.036403 |
2075 rows × 9 columns
# Random Forests expect tabular, non-sequential inputs, so the COVID-19
# time series is recast as a supervised-learning problem: lagged copies of
# the mortality rate (previous day / week / month) become ordinary feature
# columns, one observation per row.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so lags never cross a country boundary in
    # the stacked frame, then zero-fill the leading rows with no history.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# and its lagged copies (prev_day/week/month) alongside the predictors, so
# the fitted components encode the target itself (target leakage) — confirm
# whether PCA was meant to be fit on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
# Project onto the first 6 components of the matrix fitted above (which,
# per the note at the PCA fit, includes the target and its lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all inputs), not the original variables — labelling them with the raw
# feature names makes the downstream feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location' dummies and principal_df['location']
# are created here but never enter X or y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then fit the scaler on the
# training portion only (avoids leaking test-set statistics into scaling).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9986576037239662
# Refit a Random Forest on the scaled training data using the tuned
# hyperparameters found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predictions on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Model diagnostics on the test predictions: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — not a standard
# regression error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038241030962780373 R2 Score: 0.999504068554037 RMSE: 0.061839 Entropy Value: 0.00011032695266333635
# Rank the model's input columns by Random Forest importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.956911 |
| 2 | female_smokers | 0.023969 |
| 0 | cardiovasc_death_rate | 0.010972 |
| 5 | aged_65_older | 0.005894 |
| 3 | male_smokers | 0.001919 |
| 4 | life_expectancy | 0.000335 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Netherlands'
country2 = 'Romania'
# Keep only the columns used by the country-health-index Random Forest
# analysis, restricted to the rows for the selected country pair.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.320 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2075 rows × 9 columns
# Random Forests expect tabular, non-sequential inputs, so the COVID-19
# time series is recast as a supervised-learning problem: lagged copies of
# the mortality rate (previous day / week / month) become ordinary feature
# columns, one observation per row.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so lags never cross a country boundary in
    # the stacked frame, then zero-fill the leading rows with no history.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target
# and its lagged copies (prev_day/week/month) alongside the predictors, so
# the fitted components encode the target itself (target leakage) — confirm
# whether PCA was meant to be fit on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
# Project onto the first 6 components of the matrix fitted above (which,
# per the note at the PCA fit, includes the target and its lags).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all inputs), not the original variables — labelling them with the raw
# feature names makes the downstream feature importances misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot 'location' dummies and principal_df['location']
# are created here but never enter X or y below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then fit the scaler on the
# training portion only (avoids leaking test-set statistics into scaling).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9986786040355483
# Refit a Random Forest on the scaled training data using the tuned
# hyperparameters found by the grid search above.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predictions on the held-out (scaled) test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Model diagnostics on the test predictions: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — not a standard
# regression error metric. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003894529879047989 R2 Score: 0.9994949352081688 RMSE: 0.062406 Entropy Value: 0.00013511012619462034
# Rank the model's input columns by Random Forest importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.966297 |
| 2 | extreme_poverty | 0.024977 |
| 5 | population | 0.006146 |
| 3 | gdp_per_capita | 0.002094 |
| 4 | population_density | 0.000341 |
| 0 | hospital_beds_per_thousand | 0.000146 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Keep only the columns used by the population-health-index Random Forest
# analysis, restricted to the rows for the selected country pair.
index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per 'location' group so one country's series never bleeds into the other's lags)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading lag gaps with 0 fabricates "zero mortality" values at the
# start of each country's series; dropping those rows may be the safer choice — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at this point
# includes 'Mortality Rate' (the regression target) and its three lag columns. Fitting PCA on
# the target and later feeding the leading components into the model leaks the target into X
# and likely inflates the downstream CV/test scores. Also, PCA is fit on unscaled data here
# (StandardScaler is applied only after this step), so large-scale columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns actually hold principal components PC1..PC6 (linear combinations
# of all PCA input columns), not the original features; labelling them with the raw feature
# names makes the later feature-importance table read as if it ranked the original variables
# directly — TODO confirm this relabelling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below (X is built from
# principal_df), so this encoding only changes df_updated's own columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles rows, so later dates land in training while earlier
# dates are tested; a chronological split would avoid temporal leakage on this time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 forest fits; consider
# n_jobs=-1, and a time-series-aware CV splitter, since these folds mix dates freely.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9926958790865121
# Rebuild the forest with the tuned hyperparameter combination and refit it on
# the scaled training data (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation of the tuned model: R^2, MSE, RMSE, and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns a KL divergence — an unusual choice for a regression metric.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0007778872294668714 R2 Score: 0.9968165123804168 RMSE: 0.027891 Entropy Value: 0.000446539171870253
# Rank the six model inputs by the tuned forest's impurity-based importances, largest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.871034 |
| 5 | aged_65_older | 0.063863 |
| 2 | female_smokers | 0.029942 |
| 0 | cardiovasc_death_rate | 0.029801 |
| 3 | male_smokers | 0.003667 |
| 4 | life_expectancy | 0.001694 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Choose the country pair compared in this run.
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict the frame in one step: rows for the two countries, and only the
# identifier columns, the six country-health predictors, and the target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
     'Mortality Rate'],
]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2067 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per 'location' group so one country's series never bleeds into the other's lags)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading lag gaps with 0 fabricates "zero mortality" values at the
# start of each country's series; dropping those rows may be the safer choice — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at this point
# includes 'Mortality Rate' (the regression target) and its three lag columns — target leakage
# into the components used as X below. Also, PCA is fit on unscaled data here (StandardScaler
# is applied only after this step), and 'population' is orders of magnitude larger than the
# other columns, so it will likely dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns actually hold principal components PC1..PC6 (linear combinations
# of all PCA input columns), not the original features; labelling them with the raw feature
# names makes the later feature-importance table read as if it ranked the original variables
# directly — TODO confirm this relabelling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below (X is built from
# principal_df), so this encoding only changes df_updated's own columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles rows, so later dates land in training while earlier
# dates are tested; a chronological split would avoid temporal leakage on this time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 forest fits; consider
# n_jobs=-1, and a time-series-aware CV splitter, since these folds mix dates freely.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9928368516844188
# Rebuild the forest with the tuned hyperparameter combination and refit it on
# the scaled training data (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation of the tuned model: R^2, MSE, RMSE, and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns a KL divergence — an unusual choice for a regression metric.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006865654195289836 R2 Score: 0.9971902450248445 RMSE: 0.026202 Entropy Value: 0.0004341579931648298
# Rank the six model inputs by the tuned forest's impurity-based importances, largest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.877517 |
| 5 | population | 0.080276 |
| 2 | extreme_poverty | 0.031016 |
| 0 | hospital_beds_per_thousand | 0.005715 |
| 3 | gdp_per_capita | 0.003793 |
| 4 | population_density | 0.001682 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Choose the country pair compared in this run.
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the frame in one step: rows for the two countries, and only the
# identifier columns, the six population-health predictors, and the target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'Mortality Rate'],
]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per 'location' group so one country's series never bleeds into the other's lags)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading lag gaps with 0 fabricates "zero mortality" values at the
# start of each country's series; dropping those rows may be the safer choice — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at this point
# includes 'Mortality Rate' (the regression target) and its three lag columns. Fitting PCA on
# the target and later feeding the leading components into the model leaks the target into X
# and likely inflates the downstream CV/test scores. Also, PCA is fit on unscaled data here
# (StandardScaler is applied only after this step), so large-scale columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 6 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns actually hold principal components PC1..PC6 (linear combinations
# of all PCA input columns), not the original features; labelling them with the raw feature
# names makes the later feature-importance table read as if it ranked the original variables
# directly — TODO confirm this relabelling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below (X is built from
# principal_df), so this encoding only changes df_updated's own columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles rows, so later dates land in training while earlier
# dates are tested; a chronological split would avoid temporal leakage on this time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 parameter combinations x 10 folds = 810 forest fits; consider
# n_jobs=-1, and a time-series-aware CV splitter, since these folds mix dates freely.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9966517027412476
# Rebuild the forest with the tuned hyperparameter combination and refit it on
# the scaled training data (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation of the tuned model: R^2, MSE, RMSE, and entropy.
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into probability
# distributions and returns a KL divergence — an unusual choice for a regression metric.
score = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.024355474033570693 R2 Score: 0.9953469269223459 RMSE: 0.156062 Entropy Value: 0.0010087574199164379
# Rank the six model inputs by the tuned forest's impurity-based importances, largest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.908712 |
| 5 | aged_65_older | 0.037086 |
| 0 | cardiovasc_death_rate | 0.030881 |
| 2 | female_smokers | 0.019542 |
| 3 | male_smokers | 0.002968 |
| 4 | life_expectancy | 0.000811 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific Windows path — consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Choose the country pair compared in this run.
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the frame in one step: rows for the two countries, and only the
# identifier columns, the six country-health predictors, and the target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
     'Mortality Rate'],
]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per 'location' group so one country's series never bleeds into the other's lags)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading lag gaps with 0 fabricates "zero mortality" values at the
# start of each country's series; dropping those rows may be the safer choice — TODO confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which at this point
# includes 'Mortality Rate' (the regression target) and its three lag columns — target leakage
# into the components used as X below. Also, PCA is fit on unscaled data here (StandardScaler
# is applied only after this step), and 'population' is orders of magnitude larger than the
# other columns, so it will likely dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 6 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 6 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns actually hold principal components PC1..PC6 (linear combinations
# of all PCA input columns), not the original features; labelling them with the raw feature
# names makes the later feature-importance table read as if it ranked the original variables
# directly — TODO confirm this relabelling is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never included in X below (X is built from
# principal_df), so this encoding only changes df_updated's own columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles rows, so later dates land in training while earlier
# dates are tested; a chronological split would avoid temporal leakage on this time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest regressor; hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean CV score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9964944545383589
# Refit a Random Forest on the training data using the hyperparameters
# selected by the grid search (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both vectors to sum to 1 -- it treats y_test/y_pred as
# probability distributions, which mortality rates are not, and returns inf
# when any y_pred element is 0 where y_test is positive. Not a standard
# regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.025985308865571106 R2 Score: 0.9950355496702608 RMSE: 0.161200 Entropy Value: 0.0009864073077925806
# Rank the model inputs by their Random Forest importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.938681 |
| 5 | population | 0.038091 |
| 2 | extreme_poverty | 0.018932 |
| 3 | gdp_per_capita | 0.003068 |
| 4 | population_density | 0.000901 |
| 0 | hospital_beds_per_thousand | 0.000327 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- this only runs on the original author's
# machine; consider a relative path or configuration value.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run.
country1 = 'Cyprus'
country2 = 'Iceland'
# Keep only the population-health predictors plus identifiers and target,
# restricted to the two countries of interest.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 0.11011 |
2063 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from each country's mortality series:
# the mortality rate 1 day, 7 days, and 30 days earlier, respectively.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() within each country; missing history (NaN) becomes 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# PCA to address multi-collinearity among predictors.
# BUG FIX: df_updated.iloc[:, 2:] includes the 'Mortality Rate' target,
# leaking the target into the inputs and inflating model scores -- drop it.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# The 6 retained columns are principal components, NOT the original variables.
# FIX: label them PC1..PC6 instead of reusing raw feature names, which
# misrepresented the downstream feature-importance table.
selected_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity; dummies unused below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest regressor; hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean CV score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9765435752469307
# Refit a Random Forest on the training data using the hyperparameters
# selected by the grid search (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence over normalized
# vectors, not a regression metric; returns inf when y_pred is 0 where
# y_test is positive. Interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006815282511744329 R2 Score: 0.9965253284287292 RMSE: 0.026106 Entropy Value: 0.0005097313778928398
# Rank the model inputs by their Random Forest importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.560563 |
| 0 | cardiovasc_death_rate | 0.384111 |
| 2 | female_smokers | 0.029504 |
| 5 | aged_65_older | 0.010627 |
| 4 | life_expectancy | 0.007998 |
| 3 | male_smokers | 0.007196 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- only runs on the original author's machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run.
country1 = 'Cyprus'
country2 = 'Iceland'
# Keep only the country-health-index predictors plus identifiers and target,
# restricted to the two countries of interest.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
              'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2063 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from each country's mortality series:
# the mortality rate 1 day, 7 days, and 30 days earlier, respectively.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() within each country; missing history (NaN) becomes 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# PCA to address multi-collinearity among predictors.
# BUG FIX: df_updated.iloc[:, 2:] includes the 'Mortality Rate' target,
# leaking the target into the inputs and inflating model scores -- drop it.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the country health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# The 6 retained columns are principal components, NOT the original variables.
# FIX: label them PC1..PC6 instead of reusing raw feature names, which
# misrepresented the downstream feature-importance table.
selected_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity; dummies unused below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest regressor; hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean CV score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9766546149550385
# Refit a Random Forest on the training data using the hyperparameters
# selected by the grid search (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence over normalized
# vectors, not a regression metric; returns inf when y_pred is 0 where
# y_test is positive. Interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0006872215957745136 R2 Score: 0.9964963017484805 RMSE: 0.026215 Entropy Value: 0.0005144780719953724
# Rank the model inputs by their Random Forest importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.918759 |
| 2 | extreme_poverty | 0.029885 |
| 5 | population | 0.021024 |
| 0 | hospital_beds_per_thousand | 0.013524 |
| 3 | gdp_per_capita | 0.010039 |
| 4 | population_density | 0.006769 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local path -- only runs on the original author's machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this run.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the population-health predictors plus identifiers and target,
# restricted to the two countries of interest.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 0.491388 |
2076 rows × 9 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build supervised-learning lag features from each country's mortality series:
# the mortality rate 1 day, 7 days, and 30 days earlier, respectively.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() within each country; missing history (NaN) becomes 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# PCA to address multi-collinearity among predictors.
# BUG FIX: df_updated.iloc[:, 2:] includes the 'Mortality Rate' target,
# leaking the target into the inputs and inflating model scores -- drop it.
pca_inputs = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_inputs)
# Keep 6 components to match the number of input variables used by the
# Random Forest analysis for the population health index.
n_components = 6
principal_components = pca.transform(pca_inputs)[:, :n_components]
# The 6 retained columns are principal components, NOT the original variables.
# FIX: label them PC1..PC6 instead of reusing raw feature names, which
# misrepresented the downstream feature-importance table.
selected_cols = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label (kept for parity; dummies unused below).
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit standardization on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest regressor; hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters and their mean CV score.
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_}")
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998042071930531
# Refit a Random Forest on the training data using the hyperparameters
# selected by the grid search (best_params_ holds exactly the four grid keys).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence over normalized
# vectors, not a regression metric; returns inf when y_pred is 0 where
# y_test is positive. Interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0021993205246510017 R2 Score: 0.9990368656566249 RMSE: 0.046897 Entropy Value: 0.0004146683405752828
# Rank the model inputs by their Random Forest importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.727137 |
| 5 | aged_65_older | 0.225068 |
| 2 | female_smokers | 0.029176 |
| 1 | diabetes_prevalence | 0.016057 |
| 3 | male_smokers | 0.002199 |
| 4 | life_expectancy | 0.000364 |
# Load the dataframe holding the first country of each pairing produced in the
# previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the columns used by the country-health-index random forest model
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes an independent frame so
# that the lagged-mortality columns assigned later write into a real DataFrame
# instead of a slice view (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data: it does not model temporal order,
# so past information must enter the feature set explicitly. We therefore add
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate per country — via pandas' shift(), which turns the OWID
# COVID-19 time series into a tabular problem the forest can learn directly.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # Rows at the start of each country's series have no history yet; use 0
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis to decorrelate the predictors (multi-collinearity).
# NOTE(review): previously the PCA input was df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' — the target leaked into the
# components and inflated the downstream CV/test scores. Fit on predictor
# columns only.
# NOTE(review): the predictors are on very different scales (population vs.
# HDI) and standardisation happens only after PCA — consider scaling before
# PCA instead; verify intent.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of model input variables
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# Column names are kept for compatibility with the downstream selection step,
# but each column holds a principal component, NOT the named original feature.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies (note: the dummy columns are not
# part of X below, which comes from the principal components only)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features: fit the scaler on the training split only, then
# apply the same transform to both splits so no test-set statistics leak into
# the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; fixed seed for reproducible trees
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid (3 x 3 x 3 x 3 = 81 candidate settings)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 fits in parallel on all cores without changing
# the results (the estimator's random_state is fixed).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998351364576265
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every key by hand. (Equivalent
# shortcut: grid_search.best_estimator_ is already refit on the full training
# set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the random forest on the held-out test split: MSE, RMSE, R^2, and
# an "entropy" value.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; kept only for parity with the analysis.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001882911265242772 R2 Score: 0.9991754287359407 RMSE: 0.043393 Entropy Value: 0.0004605212242426924
# Importances from the fitted random forest.
# NOTE(review): the model was trained on PCA-transformed inputs (X was built
# from principal_df), so each importance belongs to a principal component, not
# to an original column — label them PC1..PCn instead of reusing the original
# feature names, which was misleading.
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | population | 0.523426 |
| 1 | human_development_index | 0.437981 |
| 2 | extreme_poverty | 0.036239 |
| 3 | gdp_per_capita | 0.001743 |
| 4 | population_density | 0.000594 |
| 0 | hospital_beds_per_thousand | 0.000017 |
# Load the dataframe holding the first country of each pairing produced in the
# previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'United States'
# Keep only the columns used by the population-health-index random forest model
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes an independent frame so
# that the lagged-mortality columns assigned later write into a real DataFrame
# instead of a slice view (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 1.084791 |
2136 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data: it does not model temporal order,
# so past information must enter the feature set explicitly. We therefore add
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate per country — via pandas' shift(), which turns the OWID
# COVID-19 time series into a tabular problem the forest can learn directly.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # Rows at the start of each country's series have no history yet; use 0
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis to decorrelate the predictors (multi-collinearity).
# NOTE(review): previously the PCA input was df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' — the target leaked into the
# components and inflated the downstream CV/test scores. Fit on predictor
# columns only.
# NOTE(review): the predictors are on different scales and standardisation
# happens only after PCA — consider scaling before PCA instead; verify intent.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of model input variables
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# Column names are kept for compatibility with the downstream selection step,
# but each column holds a principal component, NOT the named original feature.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies (note: the dummy columns are not
# part of X below, which comes from the principal components only)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features: fit the scaler on the training split only, then
# apply the same transform to both splits so no test-set statistics leak into
# the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; fixed seed for reproducible trees
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid (3 x 3 x 3 x 3 = 81 candidate settings)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 fits in parallel on all cores without changing
# the results (the estimator's random_state is fixed).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9610949677729353
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every key by hand. (Equivalent
# shortcut: grid_search.best_estimator_ is already refit on the full training
# set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the random forest on the held-out test split: MSE, RMSE, R^2, and
# an "entropy" value.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; kept only for parity with the analysis.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.619699869346872 R2 Score: 0.9744471130217611 RMSE: 0.787210 Entropy Value: 0.006087083561782794
# Importances from the fitted random forest.
# NOTE(review): the model was trained on PCA-transformed inputs (X was built
# from principal_df), so each importance belongs to a principal component, not
# to an original column — label them PC1..PCn instead of reusing the original
# feature names, which was misleading.
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.778084 |
| 1 | diabetes_prevalence | 0.091493 |
| 5 | aged_65_older | 0.053859 |
| 2 | female_smokers | 0.032591 |
| 3 | male_smokers | 0.028179 |
| 4 | life_expectancy | 0.015794 |
# Load the dataframe holding the first country of each pairing produced in the
# previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'United States'
# Keep only the columns used by the country-health-index random forest model
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the current country pair. .copy() makes an independent frame so
# that the lagged-mortality columns assigned later write into a real DataFrame
# instead of a slice view (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 9 columns
# Convert the time series into a supervised-learning table. A Random Forest is
# an ensemble method for non-sequential data: it does not model temporal order,
# so past information must enter the feature set explicitly. We therefore add
# lagged copies of the target — the previous day's, week's, and month's
# mortality rate per country — via pandas' shift(), which turns the OWID
# COVID-19 time series into a tabular problem the forest can learn directly.
lag_spec = {
    'prev_day_mortality': 1,
    'prev_week_mortality': 7,
    'prev_month_mortality': 30,
}
for lag_col, lag in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
    # Rows at the start of each country's series have no history yet; use 0
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis to decorrelate the predictors (multi-collinearity).
# NOTE(review): previously the PCA input was df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' — the target leaked into the
# components and inflated the downstream CV/test scores. Fit on predictor
# columns only.
# NOTE(review): the predictors are on very different scales (population vs.
# HDI) and standardisation happens only after PCA — consider scaling before
# PCA instead; verify intent.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 6 components to match the number of model input variables
n_components = 6
principal_components = pca.transform(pca_input)[:, :n_components]
# Column names are kept for compatibility with the downstream selection step,
# but each column holds a principal component, NOT the named original feature.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' with get_dummies (note: the dummy columns are not
# part of X below, which comes from the principal components only)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features: fit the scaler on the training split only, then
# apply the same transform to both splits so no test-set statistics leak into
# the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base random forest regressor; fixed seed for reproducible trees
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid (3 x 3 x 3 x 3 = 81 candidate settings)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 x 10 fits in parallel on all cores without changing
# the results (the estimator's random_state is fixed).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9638248976625985
# Refit a random forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids repeating every key by hand. (Equivalent
# shortcut: grid_search.best_estimator_ is already refit on the full training
# set when refit=True, the GridSearchCV default.)
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the random forest on the held-out test split: MSE, RMSE, R^2, and
# an "entropy" value.
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric; kept only for parity with the analysis.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.908396956844949 R2 Score: 0.9625428922647679 RMSE: 0.953099 Entropy Value: 0.007020342279931816
# Importances from the fitted random forest.
# NOTE(review): the model was trained on PCA-transformed inputs (X was built
# from principal_df), so each importance belongs to a principal component, not
# to an original column — label them PC1..PCn instead of reusing the original
# feature names, which was misleading.
importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({
    'feature': [f'PC{i + 1}' for i in range(len(importances))],
    'importance': importances,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.840735 |
| 2 | extreme_poverty | 0.059561 |
| 5 | population | 0.058444 |
| 3 | gdp_per_capita | 0.032403 |
| 4 | population_density | 0.008650 |
| 0 | hospital_beds_per_thousand | 0.000206 |
# Country pair-by-pair analysis relative to population density.
# Load the cleaned and preprocessed Our World in Data COVID-19 dataset.
covid_csv_path = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(covid_csv_path)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Countries paired by population density (13 pairs); build one filtered frame
# per country, keeping the established df_<Country> variable names.
df_Bulgaria = df.query('location == "Bulgaria"')
df_Canada = df.query('location == "Canada"')
df_Estonia = df.query('location == "Estonia"')
df_Finland = df.query('location == "Finland"')
df_Iceland = df.query('location == "Iceland"')
df_Ireland = df.query('location == "Ireland"')
df_Latvia = df.query('location == "Latvia"')
df_Romania = df.query('location == "Romania"')
df_Serbia = df.query('location == "Serbia"')
df_Spain = df.query('location == "Spain"')
df_Sweden = df.query('location == "Sweden"')
df_UnitedStates = df.query('location == "United States"')
df_Austria = df.query('location == "Austria"')
df_Cyprus = df.query('location == "Cyprus"')
df_Czechia = df.query('location == "Czechia"')
df_Denmark = df.query('location == "Denmark"')
df_France = df.query('location == "France"')
df_Portugal = df.query('location == "Portugal"')
df_Slovakia = df.query('location == "Slovakia"')
df_Slovenia = df.query('location == "Slovenia"')
df_Belgium = df.query('location == "Belgium"')
df_Italy = df.query('location == "Italy"')
df_Luxembourg = df.query('location == "Luxembourg"')
df_Netherlands = df.query('location == "Netherlands"')
df_Switzerland = df.query('location == "Switzerland"')
df_UnitedKingdom = df.query('location == "United Kingdom"')
# Drop the first two UK rows (same as tail(-2)) — presumably to align the UK
# series with its pair; TODO confirm the intent of this offset
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Concatenate individual country dataframes, taking the first country from each
# defined pair above, and combine them into a single dataframe.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting the combined dataframe to CSV. index=False prevents the pandas
# RangeIndex from being written as a spurious unnamed first column that would
# otherwise reappear ("Unnamed: 0") on every re-import below.
# NOTE(review): the file is written to the working directory here but re-read
# from the Downloads folder later — confirm the intended file location.
dataframe_one.to_csv("dataframe-one.csv", index=False)
# Re-import the dataframe holding the first country of each pairing from the
# previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Bulgaria'
country2 = 'Canada'
# Keep identifiers, the population-health predictors, and the target for the
# Random Forest analysis of the population health index.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day / week / month, computed per country.
lag_features = {'prev_day_mortality': 1,
                'prev_week_mortality': 7,
                'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in lag_features.items():
    # shift() leaves NaNs at the head of each country's series; treat them as 0.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): the PCA input (columns 2 onward) includes 'Mortality Rate' and
# its lag columns, so the target contributes to the components — confirm
# this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project the data onto the first 7 principal components.
# NOTE(review): the transform input (columns 2 onward) also contains
# 'Mortality Rate' and its lag columns, so the components are partly built
# from the target — confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the named
# feature — the later "feature importance" ranking inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): PCA above was fitted on the full dataset before this split,
# so test rows influenced the components — confirm acceptable for this study.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.971114273144166
# Refit a Random Forest using the best hyperparameter combination found above.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and
# the entropy of the observed vs. predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized)
# probability distributions, not paired observations — confirm this metric
# is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005813044515929637 R2 Score: 0.9981006045680973 RMSE: 0.076243 Entropy Value: 0.00055910741258311
# Rank the model inputs by importance, largest first.
# NOTE(review): the inputs are principal components carrying reused feature
# names, so this ranking describes components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.566301 |
| 0 | cardiovasc_death_rate | 0.279674 |
| 5 | aged_65_older | 0.084089 |
| 2 | female_smokers | 0.032316 |
| 6 | median_age | 0.020659 |
| 4 | life_expectancy | 0.010257 |
| 3 | male_smokers | 0.006704 |
# Re-import the dataframe holding the first country of each pairing from the
# previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Bulgaria'
country2 = 'Canada'
# Keep identifiers, the country-health predictors, and the target for the
# Random Forest analysis of the country health index.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 18563.307 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.500 | 0.929 | 0.5 | 44017.591 | 38454328 | 1.093162 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day / week / month, computed per country.
lag_features = {'prev_day_mortality': 1,
                'prev_week_mortality': 7,
                'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in lag_features.items():
    # shift() leaves NaNs at the head of each country's series; treat them as 0.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): the PCA input (columns 2 onward) includes 'Mortality Rate' and
# its lag columns, so the target contributes to the components — confirm
# this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the data onto the first 5 principal components.
# NOTE(review): the transform input (columns 2 onward) also contains
# 'Mortality Rate' and its lag columns, so the components are partly built
# from the target — confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the named
# feature — the later "feature importance" ranking inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# X: the 5 principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): PCA above was fitted on the full dataset before this split,
# so test rows influenced the components — confirm acceptable for this study.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9703778977447899
# Refit a Random Forest using the best hyperparameter combination found above.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and
# the entropy of the observed vs. predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized)
# probability distributions, not paired observations — confirm this metric
# is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00667871709917659 R2 Score: 0.9978177485628428 RMSE: 0.081723 Entropy Value: 0.0005102562409853038
# Rank the model inputs by importance, largest first.
# NOTE(review): the inputs are principal components carrying reused feature
# names, so this ranking describes components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.877965 |
| 0 | hospital_beds_per_thousand | 0.051438 |
| 2 | extreme_poverty | 0.045893 |
| 4 | population | 0.013181 |
| 3 | gdp_per_capita | 0.011522 |
# Re-import the dataframe holding the first country of each pairing from the
# previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis.
country1 = 'Estonia'
country2 = 'Finland'
# Keep identifiers, the population-health predictors, and the target for the
# Random Forest analysis of the population health index.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.55159 |
2127 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day / week / month, computed per country.
lag_features = {'prev_day_mortality': 1,
                'prev_week_mortality': 7,
                'prev_month_mortality': 30}
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in lag_features.items():
    # shift() leaves NaNs at the head of each country's series; treat them as 0.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity.
# NOTE(review): the PCA input (columns 2 onward) includes 'Mortality Rate' and
# its lag columns, so the target contributes to the components — confirm
# this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project the data onto the first 7 principal components.
# NOTE(review): the transform input (columns 2 onward) also contains
# 'Mortality Rate' and its lag columns, so the components are partly built
# from the target — confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the named
# feature — the later "feature importance" ranking inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 principal components; y: the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): PCA above was fitted on the full dataset before this split,
# so test rows influenced the components — confirm acceptable for this study.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training split only.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; its hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print(f'Best hyperparameters: {grid_search.best_params_}')
print(f'Best CV score: {grid_search.best_score_}')
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9960837123353512
# Refit a Random Forest using the best hyperparameter combination found above.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root MSE, R^2 score, and
# the entropy of the observed vs. predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as (unnormalized)
# probability distributions, not paired observations — confirm this metric
# is meaningful here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032302526563769266 R2 Score: 0.9971191211530697 RMSE: 0.056835 Entropy Value: 0.0010010015339330632
# Rank the model inputs by importance, largest first.
# NOTE(review): the inputs are principal components carrying reused feature
# names, so this ranking describes components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.916992 |
| 0 | cardiovasc_death_rate | 0.044299 |
| 2 | female_smokers | 0.018955 |
| 5 | aged_65_older | 0.012105 |
| 6 | median_age | 0.005019 |
| 3 | male_smokers | 0.001906 |
| 4 | life_expectancy | 0.000725 |
# Re-import the dataframe holding the first country of each pairing from the
# previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Finland'
# Keep only the columns used by the Random Forest analysis for the country
# health index, restricted to the two countries being compared.
# Fix: take an explicit .copy() so the lagged-column assignments in the next
# cell write into a real frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silently-lost writes under chained
# indexing).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate'],
].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 0.50 | 29481.252 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8371 | Finland | 12/25/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8372 | Finland | 12/26/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8373 | Finland | 12/27/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8374 | Finland | 12/28/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
| 8375 | Finland | 12/29/2022 | 3.28 | 0.938 | 0.04 | 40585.721 | 5540745 | 0.55159 |
2127 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag features (1 day, 1 week, 1 month), computed within each country; rows
# with no history get 0 instead of NaN.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# PCA over the numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the prediction target leaks into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per input variable of the Random Forest
# analysis for the country health index.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of *all* numeric
# columns; the names below label components, not the raw features themselves.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies(); only the target column
# is read back out of df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted standardization to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search under 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9951415575046922
# Refit a Random Forest using the grid search's best hyperparameters
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns a KL divergence; using it on raw
# regression targets/predictions is unusual — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004361670114229664 R2 Score: 0.9961100741935568 RMSE: 0.066043 Entropy Value: 0.0013594739970544801
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these are importances of PCA components that merely carry the
# original feature names — they do not score the raw features directly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.965318 |
| 2 | extreme_poverty | 0.022507 |
| 0 | hospital_beds_per_thousand | 0.006656 |
| 3 | gdp_per_capita | 0.003913 |
| 4 | population | 0.001606 |
# Reload the "first countries per pairing" dataframe produced in the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a project-relative path.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Iceland'
country2 = 'Ireland'
# Keep only the columns used by the Random Forest analysis for the population
# health index, restricted to the two countries being compared.
# Fix: take an explicit .copy() so the lagged-column assignments in the next
# cell write into a real frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silently-lost writes under chained
# indexing).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2071 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag features (1 day, 1 week, 1 month), computed within each country; rows
# with no history get 0 instead of NaN.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# PCA over the numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the prediction target leaks into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 7 principal components — one per input variable of the Random Forest
# analysis for the population health index.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of *all* numeric
# columns; the names below label components, not the raw features themselves.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies(); only the target column
# is read back out of df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted standardization to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search under 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.998195637112943
# Refit a Random Forest using the grid search's best hyperparameters
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns a KL divergence; using it on raw
# regression targets/predictions is unusual — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0024494180666809937 R2 Score: 0.9990910741405081 RMSE: 0.049492 Entropy Value: 0.0005680141052599645
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these are importances of PCA components that merely carry the
# original feature names — they do not score the raw features directly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.519423 |
| 0 | cardiovasc_death_rate | 0.452298 |
| 2 | female_smokers | 0.018856 |
| 5 | aged_65_older | 0.005584 |
| 6 | median_age | 0.001787 |
| 3 | male_smokers | 0.001295 |
| 4 | life_expectancy | 0.000757 |
# Reload the "first countries per pairing" dataframe produced in the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a project-relative path.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Iceland'
country2 = 'Ireland'
# Keep only the columns used by the Random Forest analysis for the country
# health index, restricted to the two countries being compared.
# Fix: take an explicit .copy() so the lagged-column assignments in the next
# cell write into a real frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silently-lost writes under chained
# indexing).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate'],
].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 0.2 | 67335.293 | 5023108 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 0.2 | 46482.958 | 372903 | 0.11011 |
2071 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag features (1 day, 1 week, 1 month), computed within each country; rows
# with no history get 0 instead of NaN.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# PCA over the numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the prediction target leaks into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per input variable of the Random Forest
# analysis for the country health index.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): each principal component is a linear mix of *all* numeric
# columns; the names below label components, not the raw features themselves.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'extreme_poverty', 'gdp_per_capita', 'population']
principal_df = pd.DataFrame(principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label with get_dummies(); only the target column
# is read back out of df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted standardization to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor, seeded for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search under 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9980815151576168
# Refit a Random Forest using the grid search's best hyperparameters
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns a KL divergence; using it on raw
# regression targets/predictions is unusual — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002755488974254762 R2 Score: 0.9989774978725299 RMSE: 0.052493 Entropy Value: 0.0008580065109373093
# Rank the model's inputs by impurity-based importance, highest first.
# NOTE(review): these are importances of PCA components that merely carry the
# original feature names — they do not score the raw features directly.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.927514 |
| 0 | hospital_beds_per_thousand | 0.047998 |
| 2 | extreme_poverty | 0.021699 |
| 3 | gdp_per_capita | 0.001706 |
| 4 | population | 0.001083 |
# Reload the "first countries per pairing" dataframe produced in the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a project-relative path.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Romania'
# Keep only the columns used by the Random Forest analysis for the population
# health index, restricted to the two countries being compared.
# Fix: take an explicit .copy() so the lagged-column assignments in the next
# cell write into a real frame rather than a view of the original (avoids
# SettingWithCopyWarning and potential silently-lost writes under chained
# indexing).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag features (1 day, 1 week, 1 month), computed within each country; rows
# with no history get 0 instead of NaN.
for lag_col, periods in (('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(periods)
                           .fillna(0))
# PCA over the numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and its lag columns,
# so the prediction target leaks into the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968109992321585
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007447771121020153 R2 Score: 0.9949271611600027 RMSE: 0.086300 Entropy Value: 0.0005423752242841923
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.849425 |
| 6 | median_age | 0.056326 |
| 1 | diabetes_prevalence | 0.052684 |
| 5 | aged_65_older | 0.019715 |
| 2 | female_smokers | 0.015182 |
| 3 | male_smokers | 0.006200 |
| 4 | life_expectancy | 0.000468 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded local Windows path -- consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Latvia'
country2 = 'Romania'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Display the filtered frame (notebook cell output follows).
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 5.7 | 23313.199 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.570 | 0.866 | 0.7 | 25063.846 | 1850654 | 0.631969 |
2076 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of the original frame, and
# assigning new columns to a slice triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()
# Lagged mortality features (previous day / week / month) per country, converting the
# time series into a supervised-learning table for the Random Forest.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # NOTE(review): fillna(0) fabricates "no prior mortality" for the first rows of each
    # country; dropping those rows would be a cleaner alternative.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on iloc[:, 2:], which includes the target column
# 'Mortality Rate' itself -- direct target leakage into the model inputs. The
# lagged-mortality columns are kept: they are legitimate autoregressive features.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 5 as this equals the number of input
# variables for the Random Forest Model Analysis for the country health index.
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each retained component is a linear combination of ALL numeric inputs;
# the original feature names are kept below only so downstream code keeps working --
# they do NOT identify individual source features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical country column to numeric with one-hot encoding.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): a random split shuffles a daily time series, so test days are interleaved
# with (and highly correlated to) training days; the scores below are therefore
# optimistic. A chronological split would be a fairer evaluation -- TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics learned on the training set only (no test-set leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter search for the Random Forest with 10-fold cross-validation (k = 10).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded notebook output (before the leakage fix above):
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
#   Best CV score: 0.9965359334331628
# Refit a forest with the best hyperparameters (equivalent to grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and returns a KL divergence; it is not a standard regression metric,
# so interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded notebook output (before the leakage fix above):
#   MSE: 0.0018681129196820036  R2 Score: 0.9987275876738858  RMSE: 0.043222  Entropy Value: 0.00010830270783841744
# Rank the PCA-derived features by Random Forest importance (highest first).
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.766405 |
| 0 | hospital_beds_per_thousand | 0.193593 |
| 2 | extreme_poverty | 0.023563 |
| 3 | gdp_per_capita | 0.015512 |
| 4 | population | 0.000927 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded local Windows path -- consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Serbia'
country2 = 'Spain'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Display the filtered frame (notebook cell output follows).
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2101 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Work on an explicit copy: df_updated is a filtered slice of the original frame, and
# assigning new columns to a slice triggers pandas' SettingWithCopyWarning.
df_updated = df_updated.copy()
# Lagged mortality features (previous day / week / month) per country, converting the
# time series into a supervised-learning table for the Random Forest.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # NOTE(review): fillna(0) fabricates "no prior mortality" for the first rows of each
    # country; dropping those rows would be a cleaner alternative.
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Principal Component Analysis to address multi-collinearity among the predictors.
# FIX: the original code fit PCA on iloc[:, 2:], which includes the target column
# 'Mortality Rate' itself -- direct target leakage into the model inputs. The
# lagged-mortality columns are kept: they are legitimate autoregressive features.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Setting the number of principal components to 7 as this equals the number of input
# variables for the Random Forest Model Analysis for the population health index.
n_components = 7
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): each retained component is a linear combination of ALL numeric inputs;
# the original feature names are kept below only so downstream code keeps working --
# they do NOT identify individual source features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert the categorical country column to numeric with one-hot encoding.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# NOTE(review): a random split shuffles a daily time series, so test days are interleaved
# with (and highly correlated to) training days; the scores below are therefore
# optimistic. A chronological split would be a fairer evaluation -- TODO confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics learned on the training set only (no test-set leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter search for the Random Forest with 10-fold cross-validation (k = 10).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded notebook output (before the leakage fix above):
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9981493153041182
# Refit a forest with the best hyperparameters (equivalent to grid_search.best_estimator_).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2, and an "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and returns a KL divergence; it is not a standard regression metric,
# so interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded notebook output (before the leakage fix above):
#   MSE: 0.006970410542749719  R2 Score: 0.9988825508533481  RMSE: 0.083489  Entropy Value: 0.00039008698511220404
# Rank the PCA-derived features by Random Forest importance (highest first).
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.760484 |
| 0 | cardiovasc_death_rate | 0.154926 |
| 5 | aged_65_older | 0.040161 |
| 6 | median_age | 0.027412 |
| 2 | female_smokers | 0.015343 |
| 3 | male_smokers | 0.001440 |
| 4 | life_expectancy | 0.000235 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded local Windows path -- consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output follows).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Serbia'
country2 = 'Spain'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows for the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Display the filtered frame (notebook cell output follows).
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.806 | 0.05 | 14048.881 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.970 | 0.904 | 1.00 | 34272.360 | 47558632 | 0.855148 |
2101 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9979148999700286
# Refit a random forest using the hyperparameters selected by the grid search.
# best_params_ holds exactly n_estimators / max_depth / min_samples_split /
# min_samples_leaf, so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate on the held-out test split: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalized)
# probability distributions and computes a relative-entropy statistic -- it is not a
# standard regression metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.013859495294009453 R2 Score: 0.9977781393083903 RMSE: 0.117726 Entropy Value: 0.0012691410818176068
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.930072 |
| 2 | extreme_poverty | 0.041182 |
| 0 | hospital_beds_per_thousand | 0.024001 |
| 3 | gdp_per_capita | 0.004329 |
| 4 | population | 0.000417 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this breaks on any other machine;
# prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Sweden'
country2 = 'United States'
# Restrict to the two selected countries and to the population-health features in one step.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
     'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 days of each country's lag with 0 fabricates a
# "zero mortality" history at the start of each series; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns just
# created -- the prediction target leaks into the PCA inputs used to build the model's
# features. PCA is also fit on unscaled data, so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project the PCA inputs onto the first n_components principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original variables --
# each component is a linear mix of all PCA inputs (which here include the mortality
# columns). Reusing the original feature names makes the downstream "feature importance"
# table misleading; neutral names such as 'PC1'..'PC7' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model (X is built
# from principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series places adjacent days in both train and
# test, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9962437618415235
# Refit a random forest using the hyperparameters selected by the grid search.
# best_params_ holds exactly n_estimators / max_depth / min_samples_split /
# min_samples_leaf, so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate on the held-out test split: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalized)
# probability distributions and computes a relative-entropy statistic -- it is not a
# standard regression metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.08758952328361695 R2 Score: 0.983188541421376 RMSE: 0.295955 Entropy Value: 0.003305972151973518
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.917523 |
| 0 | cardiovasc_death_rate | 0.043925 |
| 2 | female_smokers | 0.026333 |
| 6 | median_age | 0.005418 |
| 3 | male_smokers | 0.003534 |
| 5 | aged_65_older | 0.002001 |
| 4 | life_expectancy | 0.001267 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this breaks on any other machine;
# prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Sweden'
country2 = 'United States'
# Restrict to the two selected countries and to the country-health-index features in one step.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate'],
]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 2.22 | 0.945 | 0.5 | 46949.283 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.2 | 54225.446 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 days of each country's lag with 0 fabricates a
# "zero mortality" history at the start of each series; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns just
# created -- the prediction target leaks into the PCA inputs used to build the model's
# features. PCA is also fit on unscaled data, so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the PCA inputs onto the first n_components principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original variables --
# each component is a linear mix of all PCA inputs (which here include the mortality
# columns). Reusing the original feature names makes the downstream "feature importance"
# table misleading; neutral names such as 'PC1'..'PC5' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model (X is built
# from principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series places adjacent days in both train and
# test, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9968676791176364
# Refit a random forest using the hyperparameters selected by the grid search.
# best_params_ holds exactly n_estimators / max_depth / min_samples_split /
# min_samples_leaf, so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate on the held-out test split: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalized)
# probability distributions and computes a relative-entropy statistic -- it is not a
# standard regression metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.07653851541761683 R2 Score: 0.9853096120018122 RMSE: 0.276656 Entropy Value: 0.002994321361922591
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.961189 |
| 2 | extreme_poverty | 0.031117 |
| 3 | gdp_per_capita | 0.004991 |
| 4 | population | 0.001474 |
| 0 | hospital_beds_per_thousand | 0.001230 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this breaks on any other machine;
# prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing.
country1 = 'Austria'
country2 = 'Cyprus'
# Restrict to the two selected countries and to the population-health features in one step.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
     'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4148 | Cyprus | 12/25/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4149 | Cyprus | 12/26/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4150 | Cyprus | 12/27/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4151 | Cyprus | 12/28/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
| 4152 | Cyprus | 12/29/2022 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.199679 |
2066 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the first 1/7/30 days of each country's lag with 0 fabricates a
# "zero mortality" history at the start of each series; dropping those rows would be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location' and 'date', which at this
# point includes 'Mortality Rate' itself plus the three lagged-mortality columns just
# created -- the prediction target leaks into the PCA inputs used to build the model's
# features. PCA is also fit on unscaled data, so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project the PCA inputs onto the first n_components principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original variables --
# each component is a linear mix of all PCA inputs (which here include the mortality
# columns). Reusing the original feature names makes the downstream "feature importance"
# table misleading; neutral names such as 'PC1'..'PC7' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never fed to the model (X is built
# from principal_df); only 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of a time series places adjacent days in both train and
# test, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the scaler fitted on the training split only.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Base random forest regressor; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Hyperparameter search space for the random forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9956167792156323
# Refit a random forest using the hyperparameters selected by the grid search.
# best_params_ holds exactly n_estimators / max_depth / min_samples_split /
# min_samples_leaf, so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate on the held-out test split: MSE, RMSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalized)
# probability distributions and computes a relative-entropy statistic -- it is not a
# standard regression metric; interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0017959116723188738 R2 Score: 0.9984082585103836 RMSE: 0.042378 Entropy Value: 0.0004481618279649274
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.665281 |
| 0 | cardiovasc_death_rate | 0.297172 |
| 2 | female_smokers | 0.025477 |
| 6 | median_age | 0.006165 |
| 5 | aged_65_older | 0.002852 |
| 3 | male_smokers | 0.001868 |
| 4 | life_expectancy | 0.001184 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this breaks on any other machine;
# prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'Austria'
country2 = 'Cyprus'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated  # display the filtered dataframe
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.70 | 45436.686 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4148 | Cyprus | 12/25/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4149 | Cyprus | 12/26/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4150 | Cyprus | 12/27/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4151 | Cyprus | 12/28/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
| 4152 | Cyprus | 12/29/2022 | 3.40 | 0.887 | 0.15 | 32415.132 | 896007 | 0.199679 |
2066 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus the three lag columns just created —
# the prediction target feeds the PCA inputs (data leakage). PCA is also fit
# on all rows before the train/test split. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project all rows onto the components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the raw
# variables — labelling them with the original feature names makes the later
# feature-importance table look like it ranks the raw features when it
# actually ranks PCs. Consider names like 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df
# and y only reads 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# Model inputs are the PCA scores; target is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a time series are shuffled across CV folds here;
# combined with the lag columns this inflates the CV score — a time-aware
# splitter (e.g. TimeSeriesSplit) would be the stricter choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9945135037982247
# Refit a Random Forest on the training data using the tuned hyperparameters
# chosen by the grid search above (the grid keys match these keyword args
# exactly: n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: square root of MSE, in the same units as the mortality rate.
rmse = np.sqrt(mse)
# Coefficient of determination (R^2) on the test set.
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes the Kullback-Leibler divergence between them, not the entropy
# of the prediction errors — confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002590042399429916 R2 Score: 0.9977044094035454 RMSE: 0.050892 Entropy Value: 0.0005194256643318901
# Rank the model inputs by Random Forest importance (importances sum to 1).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.902511 |
| 2 | extreme_poverty | 0.054517 |
| 0 | hospital_beds_per_thousand | 0.030059 |
| 3 | gdp_per_capita | 0.011764 |
| 4 | population | 0.001149 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute Windows-user path — not portable to other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # display the loaded dataframe
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the population-health-index analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated  # display the filtered dataframe
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
2096 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus the three lag columns just created —
# the prediction target feeds the PCA inputs (data leakage). PCA is also fit
# on all rows before the train/test split. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project all rows onto the components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the raw
# variables — labelling them with the original feature names makes the later
# feature-importance table look like it ranks the raw features when it
# actually ranks PCs. Consider names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df
# and y only reads 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs are the PCA scores; target is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a time series are shuffled across CV folds here;
# combined with the lag columns this inflates the CV score — a time-aware
# splitter (e.g. TimeSeriesSplit) would be the stricter choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9987756521772833
# Refit a Random Forest on the training data using the tuned hyperparameters
# chosen by the grid search above (the grid keys match these keyword args
# exactly: n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: square root of MSE, in the same units as the mortality rate.
rmse = np.sqrt(mse)
# Coefficient of determination (R^2) on the test set.
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes the Kullback-Leibler divergence between them, not the entropy
# of the prediction errors — confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0030527308940722715 R2 Score: 0.9974655443925158 RMSE: 0.055252 Entropy Value: 0.0007719519126651911
# Rank the model inputs by Random Forest importance (importances sum to 1).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.958274 |
| 2 | female_smokers | 0.023643 |
| 0 | cardiovasc_death_rate | 0.013727 |
| 3 | male_smokers | 0.001553 |
| 6 | median_age | 0.001493 |
| 5 | aged_65_older | 0.001015 |
| 4 | life_expectancy | 0.000294 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute Windows-user path — not portable to other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # display the loaded dataframe
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the country-health-index analysis.
country1 = 'Czechia'
country2 = 'Denmark'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated  # display the filtered dataframe
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 6.63 | 0.90 | 0.0 | 32605.906 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 2.50 | 0.94 | 0.2 | 46682.515 | 5882259 | 0.229131 |
2096 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus the three lag columns just created —
# the prediction target feeds the PCA inputs (data leakage). PCA is also fit
# on all rows before the train/test split. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project all rows onto the components and keep only the first n_components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the raw
# variables — labelling them with the original feature names makes the later
# feature-importance table look like it ranks the raw features when it
# actually ranks PCs. Consider names like 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X comes from principal_df
# and y only reads 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# Model inputs are the PCA scores; target is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set (using training-set statistics only)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid (3*3*3*3 = 81 hyperparameter combinations)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows of a time series are shuffled across CV folds here;
# combined with the lag columns this inflates the CV score — a time-aware
# splitter (e.g. TimeSeriesSplit) would be the stricter choice.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985367139047165
# Refit a Random Forest on the training data using the tuned hyperparameters
# chosen by the grid search above (the grid keys match these keyword args
# exactly: n_estimators, max_depth, min_samples_split, min_samples_leaf).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: square root of MSE, in the same units as the mortality rate.
rmse = np.sqrt(mse)
# Coefficient of determination (R^2) on the test set.
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and
# computes the Kullback-Leibler divergence between them, not the entropy
# of the prediction errors — confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002246492034029117 R2 Score: 0.9981349046049655 RMSE: 0.047397 Entropy Value: 0.0005257623846396152
# Rank the model inputs by Random Forest importance (importances sum to 1).
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values(by='importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.968816 |
| 2 | extreme_poverty | 0.026316 |
| 0 | hospital_beds_per_thousand | 0.002376 |
| 3 | gdp_per_capita | 0.002136 |
| 4 | population | 0.000355 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute Windows-user path — not portable to other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated  # display the loaded dataframe
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the population-health-index analysis.
country1 = 'France'
country2 = 'Portugal'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated  # display the filtered dataframe
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8377 | France | 1/25/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8378 | France | 1/26/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8379 | France | 1/27/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| 8380 | France | 1/28/2020 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2105 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# shift() is applied per country so lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' itself plus the three lag columns just created —
# the prediction target feeds the PCA inputs (data leakage). PCA is also fit
# on all rows before the train/test split. Confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project all post-'date' columns onto the first 7 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each principal component is a linear mixture
# of ALL PCA inputs (including 'Mortality Rate' and its lags), not the single original feature it
# is named after. Names like 'PC1'..'PC7' would be honest; verify before interpreting results.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so this encoding has no
# effect on the model — dead transformation unless X is extended later.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = first 7 principal components (see naming caveat above); y = same-day mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): standardization is applied AFTER PCA here; the conventional order is to
# standardize the raw features first and then fit PCA on the scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (the n_estimators=100 set here is only a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3 = 81 candidates x 10 folds = 810 forest fits; default scoring for a
# regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9935952287493937
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ is already refit on the full training data
# (refit=True by default), so rebuilding the model here duplicates that work.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two probability
# distributions after normalizing them; feeding raw mortality values (which include zeros) is
# not a standard regression metric — interpret this number with caution or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.12998101843122575 R2 Score: 0.9887399453385879 RMSE: 0.360529 Entropy Value: 0.002137830843805542
# Rank model inputs by impurity-based importance.
# NOTE(review): the model was trained on principal components, so these importances belong to
# the PCs, not to the original features whose names were reused for the columns — the ranking
# cannot be read as "diabetes_prevalence is the top predictor" without mapping PC loadings back.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.625213 |
| 0 | cardiovasc_death_rate | 0.328925 |
| 6 | median_age | 0.020631 |
| 2 | female_smokers | 0.015474 |
| 5 | aged_65_older | 0.005432 |
| 3 | male_smokers | 0.003226 |
| 4 | life_expectancy | 0.001100 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable; consider a relative path or a
# configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded frame in the notebook.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the country-health-index analysis.
country1 = 'France'
country2 = 'Portugal'
# Extracting important features for the Random Forest Model Analysis for the country health index
# Keep only the country-level socioeconomic predictors plus the 'Mortality Rate' target.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Bare expression: displays the filtered frame in the notebook.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 8376 | France | 1/24/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8377 | France | 1/25/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8378 | France | 1/26/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8379 | France | 1/27/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| 8380 | France | 1/28/2020 | 5.98 | 0.901 | 0.02 | 38605.671 | 67813000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 3.39 | 0.864 | 0.50 | 27936.896 | 10270857 | 0.462977 |
2105 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes chronological order within each location — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero-mortality history at each series start.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies, so the
# components encode the target — data leakage that inflates downstream scores. PCA is also fit
# unscaled (the raw 'population' column dominates the variance) and on the full dataset before
# the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading column names — each PC is a linear mixture of ALL PCA inputs
# (including the target and its lags), not the single original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below — dead transformation.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling after PCA — conventional order is to standardize before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# 81 candidates x 10 folds; default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9902801713528262
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ is already refit on the training data; this rebuild
# duplicates that work.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence between normalized distributions — not a
# standard regression metric on raw mortality values; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.11164969909464596 R2 Score: 0.9903279591904327 RMSE: 0.334140 Entropy Value: 0.001864422558963011
# Rank model inputs by impurity-based importance.
# NOTE(review): the model was trained on principal components, so these importances belong to
# the PCs, not to the original features whose names were reused for the columns.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.956109 |
| 2 | extreme_poverty | 0.023120 |
| 0 | hospital_beds_per_thousand | 0.011819 |
| 3 | gdp_per_capita | 0.006569 |
| 4 | population | 0.002383 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded frame in the notebook.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the population-health-index analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Bare expression: displays the filtered frame in the notebook.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2091 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes chronological order within each location — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero-mortality history at each series start.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies, so the
# components encode the target — data leakage that inflates downstream scores. PCA is also fit
# on unscaled data and on the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading column names — each PC is a linear mixture of ALL PCA inputs
# (including the target and its lags), not the single original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below — dead transformation.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling after PCA — conventional order is to standardize before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# 81 candidates x 10 folds; default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.997363058447263
# fit random forest model with best hyperparameters from above
# NOTE(review): grid_search.best_estimator_ is already refit on the training data; this rebuild
# duplicates that work.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence between normalized distributions — not a
# standard regression metric on raw mortality values; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004452525636132693 R2 Score: 0.9978154993065472 RMSE: 0.066727 Entropy Value: 0.0005847514751188148
# Rank model inputs by impurity-based importance.
# NOTE(review): the model was trained on principal components, so these importances belong to
# the PCs, not to the original features whose names were reused for the columns.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.864046 |
| 1 | diabetes_prevalence | 0.091568 |
| 0 | cardiovasc_death_rate | 0.032643 |
| 5 | aged_65_older | 0.008213 |
| 2 | female_smokers | 0.002508 |
| 3 | male_smokers | 0.000722 |
| 4 | life_expectancy | 0.000301 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the loaded frame in the notebook.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair for this run of the country-health-index analysis.
country1 = 'Slovakia'
country2 = 'Slovenia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
# Bare expression: displays the filtered frame in the notebook.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 0.7 | 30155.152 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 31400.840 | 2119843 | 0.536669 |
2091 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes chronological order within each location — confirm upstream sort.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero-mortality history at each series start.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lagged copies, so the
# components encode the target — data leakage that inflates downstream scores. PCA is also fit
# unscaled (the raw 'population' column dominates the variance) and on the full dataset before
# the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading column names — each PC is a linear mixture of ALL PCA inputs
# (including the target and its lags), not the single original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns never enter X below — dead transformation.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling after PCA — conventional order is to standardize before fitting PCA.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder — the grid below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# 81 candidates x 10 folds; default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9972517500748429
# Refit a Random Forest with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence
# of the two vectors normalized as probability distributions — not a standard
# regression metric — and zeros in either vector can produce inf/nan.
# Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Output: MSE: 0.006989074948754362  R2 Score: 0.9965710160210536
#         RMSE: 0.083601  Entropy Value: 0.0008646162310375566
# Rank the (PCA-derived, mislabeled with raw feature names) inputs by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Output (feature, importance):
#   human_development_index     0.929125
#   hospital_beds_per_thousand  0.034955
#   extreme_poverty             0.026035
#   gdp_per_capita              0.009089
#   population                  0.000797
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Output: 27272 rows x 17 columns (date, cardiovasc_death_rate,
# diabetes_prevalence, female_smokers, male_smokers, life_expectancy,
# aged_65_older, aged_70_older, median_age, hospital_beds_per_thousand,
# human_development_index, extreme_poverty, gdp_per_capita, population_density,
# population, Mortality Rate, location), Austria 2/25/2020 through
# United States 12/29/2022.
# Country pair analysed in this run.
country1 = 'Belgium'
country2 = 'Italy'
# Extracting the population-health features for the Random Forest Model Analysis.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Output: 2124 rows x 10 columns (Belgium 2/4/2020 ... Italy 12/29/2022).
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality
# rates, shifting within each country so one location's series never bleeds
# into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaNs introduced at the start of each country's series with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test split
# below, leaking test-row information into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# (pasted notebook cell output removed: "PCA()" estimator repr)
# Keep 7 principal components (= number of input variables for the Random
# Forest Model Analysis for the population health index).
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never fed to the model below (X comes
# from principal_df); only y is taken from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids leaking test-set statistics).
scaler = StandardScaler()
scaler.fit(X_train)
# (pasted notebook cell output removed: "StandardScaler()" estimator repr)
# Apply the fitted scaling to both the training and test sets.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Define the hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9992307049399741
# Refit a Random Forest with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the
# normalized vectors, not a standard regression metric; zeros can give inf/nan.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Output: MSE: 0.011292162553126187  R2 Score: 0.9994176200453608
#         RMSE: 0.106265  Entropy Value: 0.0003509709613632872
# Rank the (PCA-derived, mislabeled with raw feature names) inputs by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Output (feature, importance):
#   cardiovasc_death_rate  0.955787
#   female_smokers         0.033587
#   diabetes_prevalence    0.007834
#   aged_65_older          0.001161
#   male_smokers           0.000818
#   median_age             0.000596
#   life_expectancy        0.000217
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Output: 27272 rows x 17 columns, Austria 2/25/2020 through
# United States 12/29/2022 (same schema as the earlier load of this file).
# Country pair analysed in this run.
country1 = 'Belgium'
country2 = 'Italy'
# Extracting the country-health features for the Random Forest Model Analysis.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Output: 2124 rows x 8 columns (Belgium 2/4/2020 ... Italy 12/29/2022).
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality
# rates, shifting within each country's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaNs introduced at the start of each country's series with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test split,
# leaking test-row information into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# (pasted notebook cell output removed: "PCA()" estimator repr)
# Keep 5 principal components (= number of input variables for the Random
# Forest Model Analysis for the country health index).
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never fed to the model below (X comes
# from principal_df); only y is taken from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids leaking test-set statistics).
scaler = StandardScaler()
scaler.fit(X_train)
# (pasted notebook cell output removed: "StandardScaler()" estimator repr)
# Apply the fitted scaling to both the training and test sets.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Define the hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9991794764886219
# Refit a Random Forest with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the
# normalized vectors, not a standard regression metric; zeros can give inf/nan.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Output: MSE: 0.022094083557412877  R2 Score: 0.9988605237199321
#         RMSE: 0.148641  Entropy Value: 0.0005922597227633427
# Rank the (PCA-derived, mislabeled with raw feature names) inputs by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Output (feature, importance):
#   human_development_index     0.958815
#   extreme_poverty             0.034811
#   hospital_beds_per_thousand  0.004344
#   gdp_per_capita              0.001683
#   population                  0.000347
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Output: 27272 rows x 17 columns, Austria 2/25/2020 through
# United States 12/29/2022 (same schema as the earlier load of this file).
# Country pair analysed in this run.
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Extracting the population-health features for the Random Forest Model Analysis.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Output: 2078 rows x 10 columns (Netherlands 2/27/2020 ... Luxembourg 12/29/2022).
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, week, and month mortality
# rates, shifting within each country's series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the NaNs introduced at the start of each country's series with 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to address multi-collinearity.
# NOTE(review): PCA is fitted on the full dataset before the train/test split,
# leaking test-row information into the components.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
# (pasted notebook cell output removed: "PCA()" estimator repr)
# Keep 7 principal components (= number of input variables for the Random
# Forest Model Analysis for the population health index).
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the dummy columns are never fed to the model below (X comes
# from principal_df); only y is taken from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only (avoids leaking test-set statistics).
scaler = StandardScaler()
scaler.fit(X_train)
# (pasted notebook cell output removed: "StandardScaler()" estimator repr)
# Apply the fitted scaling to both the training and test sets.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Define the hyperparameter grid for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9987383944393313
# Refit a Random Forest with the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence of the
# normalized vectors, not a standard regression metric; zeros can give inf/nan.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Output: MSE: 0.010694640005461605  R2 Score: 0.9986061376968532
#         RMSE: 0.103415  Entropy Value: 0.001142833048385961
# Rank the (PCA-derived, mislabeled with raw feature names) inputs by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Output (feature, importance):
#   diabetes_prevalence    0.778276
#   cardiovasc_death_rate  0.164676
#   aged_65_older          0.037707
#   female_smokers         0.013496
#   male_smokers           0.002915
#   median_age             0.002097
#   life_expectancy        0.000833
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Netherlands'
# Keep only the country-health-index columns, then restrict the rows to the
# current pair of countries.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 48472.545 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 0.2 | 94277.965 | 647601 | 0.377872 |
2078 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series to a supervised-learning layout: per-country lagged
# mortality at 1-day, 7-day, and 30-day horizons, with the leading NaNs
# (rows that have no history yet within a country) replaced by 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the components
# later used as model inputs — this likely inflates the reported R^2.
# PCA is also fit on unscaled data, so high-variance columns such as
# population dominate the components; consider standardizing first and
# fitting on predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# NOTE(review): pca was fit on df_updated.iloc[:, 2:], which also contains
# 'Mortality Rate' and its three lagged columns (9 columns total), so the
# 5 components kept here carry target information (leakage).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL PCA inputs), not the original
# feature it is named after; downstream "feature importances" therefore
# rank components, not the named variables. Consider 'PC1'..'PC5' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then learn the
# standardization statistics from the training split only (no test-set
# information leaks into the scaler).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Baseline forest whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985686351034438
# Reuse the estimator refit by GridSearchCV: with the default refit=True,
# grid_search has already retrained a forest with the best hyperparameters
# (and rf's random_state=42) on the full training set, so rebuilding it by
# hand-copying each best_params_ entry is redundant and error-prone.
best_rf_model = grid_search.best_estimator_
# Predict the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence
# D(p||q) after normalizing each array to sum to 1 — it is not the Shannon
# entropy of the errors and is not a standard regression metric; it is also
# undefined (inf) wherever y_pred is 0 but y_test is not. Confirm this is
# the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006943843587888249 R2 Score: 0.9990949894703176 RMSE: 0.083330 Entropy Value: 0.0007068680961652596
# Rank the model's inputs by impurity-based importance, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.951584 |
| 2 | extreme_poverty | 0.037349 |
| 3 | gdp_per_capita | 0.007125 |
| 0 | hospital_beds_per_thousand | 0.003327 |
| 4 | population | 0.000615 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — prefer a relative or
# configurable path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep only the population-health-index columns, then restrict the rows to
# the current pair of countries.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.322149 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series to a supervised-learning layout: per-country lagged
# mortality at 1-day, 7-day, and 30-day horizons, with the leading NaNs
# (rows that have no history yet within a country) replaced by 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the components
# later used as model inputs — this likely inflates the reported R^2.
# PCA is also fit on unscaled data, so high-variance columns dominate the
# components; consider standardizing first and fitting on predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fit on df_updated.iloc[:, 2:], which also contains
# 'Mortality Rate' and its three lagged columns (11 columns total), so the
# 7 components kept here carry target information (leakage).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL PCA inputs), not the original
# feature it is named after; downstream "feature importances" therefore
# rank components, not the named variables. Consider 'PC1'..'PC7' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then learn the
# standardization statistics from the training split only (no test-set
# information leaks into the scaler).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Baseline forest whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best CV score: 0.9632046791137154
# Reuse the estimator refit by GridSearchCV: with the default refit=True,
# grid_search has already retrained a forest with the best hyperparameters
# (and rf's random_state=42) on the full training set, so rebuilding it by
# hand-copying each best_params_ entry is redundant and error-prone.
best_rf_model = grid_search.best_estimator_
# Predict the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence
# D(p||q) after normalizing each array to sum to 1 — it is not the Shannon
# entropy of the errors and is not a standard regression metric; it is also
# undefined (inf) wherever y_pred is 0 but y_test is not. Confirm this is
# the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.38500249959391253 R2 Score: 0.9847974633914387 RMSE: 0.620486 Entropy Value: 0.004637356948453654
# Rank the model's inputs by impurity-based importance, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.817592 |
| 5 | aged_65_older | 0.046701 |
| 6 | median_age | 0.039390 |
| 1 | diabetes_prevalence | 0.033816 |
| 2 | female_smokers | 0.029796 |
| 3 | male_smokers | 0.020359 |
| 4 | life_expectancy | 0.012346 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, machine-specific path — prefer a relative or
# configurable path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'United Kingdom'
# Keep only the country-health-index columns, then restrict the rows to the
# current pair of countries.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.932 | 0.20 | 39753.244 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14644 | Switzerland | 12/25/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14645 | Switzerland | 12/26/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14646 | Switzerland | 12/27/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322922 |
| 14647 | Switzerland | 12/28/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.323082 |
| 14648 | Switzerland | 12/29/2022 | 4.53 | 0.955 | 0.03 | 57410.166 | 8740471 | 0.322149 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the time series to a supervised-learning layout: per-country lagged
# mortality at 1-day, 7-day, and 30-day horizons, with the leading NaNs
# (rows that have no history yet within a country) replaced by 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the target (and its lags) leak into the components
# later used as model inputs — this likely inflates the reported R^2.
# PCA is also fit on unscaled data, so high-variance columns such as
# population dominate the components; consider standardizing first and
# fitting on predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# NOTE(review): pca was fit on df_updated.iloc[:, 2:], which also contains
# 'Mortality Rate' and its three lagged columns (9 columns total), so the
# 5 components kept here carry target information (leakage).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear mix of ALL PCA inputs), not the original
# feature it is named after; downstream "feature importances" therefore
# rank components, not the named variables. Consider 'PC1'..'PC5' labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, then learn the
# standardization statistics from the training split only (no test-set
# information leaks into the scaler).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned from the training data.
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)
# Baseline forest whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9540507391434925
# Reuse the estimator refit by GridSearchCV: with the default refit=True,
# grid_search has already retrained a forest with the best hyperparameters
# (and rf's random_state=42) on the full training set, so rebuilding it by
# hand-copying each best_params_ entry is redundant and error-prone.
best_rf_model = grid_search.best_estimator_
# Predict the held-out test split with the tuned model.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is the KL divergence
# D(p||q) after normalizing each array to sum to 1 — it is not the Shannon
# entropy of the errors and is not a standard regression metric; it is also
# undefined (inf) wherever y_pred is 0 but y_test is not. Confirm this is
# the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.6419846927529824 R2 Score: 0.9351631939309383 RMSE: 1.281400 Entropy Value: 0.008444377761462866
# Rank the model's inputs by impurity-based importance, highest first.
feature_importances = pd.DataFrame(
    {'feature': selected_cols, 'importance': best_rf_model.feature_importances_}
).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.861841 |
| 2 | extreme_poverty | 0.061051 |
| 3 | gdp_per_capita | 0.049401 |
| 4 | population | 0.022082 |
| 0 | hospital_beds_per_thousand | 0.005626 |
# Country Pair by Pair Analysis relative to hospital beds per thousand
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): absolute, machine-specific path — prefer a relative or
# configurable path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on hospital beds per thousand (13 pairs of countries)
# One sub-frame per country; individual variables are kept (rather than a
# dict) because later cells refer to them by name.
df_Austria = df.loc[df["location"] == "Austria"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_France = df.loc[df["location"] == "France"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_Latvia = df.loc[df["location"] == "Latvia"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_Canada = df.loc[df["location"] == "Canada"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
# Drop the first two UK rows (equivalent to .tail(-2)).
# NOTE(review): presumably this aligns the UK start date with its paired
# country — confirm the two-row offset is still correct for the data file.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
# Concatenate the first country from each defined pair into one dataframe.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file
# NOTE(review): written to the working directory, while later cells read
# "dataframe-one.csv" from C:/Users/marco/Downloads — make sure both paths
# refer to the same file.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Bulgaria'
# Population-health features used as Random Forest inputs for this country pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
# Select the feature columns and keep only the two paired countries in one step.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3121 | Bulgaria | 12/25/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949845 |
| 3122 | Bulgaria | 12/26/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.950107 |
| 3123 | Bulgaria | 12/27/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949883 |
| 3124 | Bulgaria | 12/28/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949716 |
| 3125 | Bulgaria | 12/29/2022 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 2.949605 |
2066 rows × 10 columns
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of sequence, so the mortality rate 1 day, 7 days, and 30 days back
# is materialised as explicit lag columns via pandas' shift(), computed per
# country so one country's history never bleeds into another's. This lets the
# Random Forest rank which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history for this lag get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column AFTER 'location' and 'date' --
# that includes 'Mortality Rate' (the prediction target) and the three lag
# columns, so the fitted components mix the target into the model inputs
# (target leakage); confirm this is intended. PCA is also fitted on unscaled
# data, so the highest-variance raw columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns of the transformed matrix.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL input columns), not the named
# feature itself -- downstream "feature importances" therefore describe
# components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded location columns are never added to X below --
# the encoding's only effect here is that 'Mortality Rate' is read from the
# dummified frame. Confirm the dummies were meant to reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the first 7 principal components (labelled with raw-feature names above).
X = principal_df[selected_cols].values
# y: the per-day COVID-19 mortality rate target.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor to be tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space (key order kept stable so tie-breaking is unchanged).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9548694967552779
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly guarantees every tuned parameter is applied
# and cannot drift out of sync if the search grid gains a parameter later
# (the original copied each entry by hand). grid_search.best_estimator_ would
# also work, but an explicit refit keeps the training step visible.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out, scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalises its inputs to sum to 1); y_test
# and y_pred are raw mortality rates, not distributions, and y_test contains
# zeros, so this value is hard to interpret as a regression metric -- confirm
# it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003026954642565577 R2 Score: 0.9983335176391287 RMSE: 0.055018 Entropy Value: 0.0007808314682988706
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal components relabelled with raw feature
# names (see the PCA step above), so each "importance" belongs to a component,
# not to the named raw feature.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.791317 |
| 6 | median_age | 0.093374 |
| 1 | diabetes_prevalence | 0.051696 |
| 2 | female_smokers | 0.025939 |
| 3 | male_smokers | 0.015921 |
| 5 | aged_65_older | 0.012669 |
| 4 | life_expectancy | 0.009084 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific path. The file written earlier in this
# notebook went to the working directory as "dataframe-one.csv"; confirm the
# Downloads copy is the same file and not stale.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Bulgaria'
# Country-level socioeconomic features used as Random Forest inputs for this pair.
country_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Select the feature columns and keep only the two paired countries in one step.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3121 | Bulgaria | 12/25/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949845 |
| 3122 | Bulgaria | 12/26/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.950107 |
| 3123 | Bulgaria | 12/27/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949883 |
| 3124 | Bulgaria | 12/28/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949716 |
| 3125 | Bulgaria | 12/29/2022 | 0.816 | 1.5 | 18563.307 | 65.180 | 6781955 | 2.949605 |
2066 rows × 8 columns
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of sequence, so the mortality rate 1 day, 7 days, and 30 days back
# is materialised as explicit lag columns via pandas' shift(), computed per
# country so one country's history never bleeds into another's. This lets the
# Random Forest rank which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history for this lag get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column AFTER 'location' and 'date' --
# that includes 'Mortality Rate' (the prediction target) and the three lag
# columns, so the fitted components mix the target into the model inputs
# (target leakage); confirm this is intended. PCA is also fitted on unscaled
# data, so the highest-variance raw columns (e.g. population) dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Keep only the first n_components columns of the transformed matrix.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL input columns), not the named
# feature itself -- downstream "feature importances" therefore describe
# components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded location columns are never added to X below --
# the encoding's only effect here is that 'Mortality Rate' is read from the
# dummified frame. Confirm the dummies were meant to reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the first 5 principal components (labelled with raw-feature names above).
X = principal_df[selected_cols].values
# y: the per-day COVID-19 mortality rate target.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor to be tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space (key order kept stable so tie-breaking is unchanged).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9452216055850846
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly guarantees every tuned parameter is applied
# and cannot drift out of sync if the search grid gains a parameter later
# (the original copied each entry by hand). grid_search.best_estimator_ would
# also work, but an explicit refit keeps the training step visible.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out, scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalises its inputs to sum to 1); y_test
# and y_pred are raw mortality rates, not distributions, and y_test contains
# zeros, so this value is hard to interpret as a regression metric -- confirm
# it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03325057701492528 R2 Score: 0.9816939774039035 RMSE: 0.182347 Entropy Value: 0.0031574369706604878
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal components relabelled with raw feature
# names (see the PCA step above), so each "importance" belongs to a component,
# not to the named raw feature.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.616534 |
| 0 | human_development_index | 0.327116 |
| 2 | gdp_per_capita | 0.035922 |
| 3 | population_density | 0.010367 |
| 4 | population | 0.010062 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific path. The file written earlier in this
# notebook went to the working directory as "dataframe-one.csv"; confirm the
# Downloads copy is the same file and not stale.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'France'
# Population-health features used as Random Forest inputs for this country pair.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
# Select the feature columns and keep only the two paired countries in one step.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2105 rows × 10 columns
# Convert the time series into a supervised-learning table: a Random Forest has
# no notion of sequence, so the mortality rate 1 day, 7 days, and 30 days back
# is materialised as explicit lag columns via pandas' shift(), computed per
# country so one country's history never bleeds into another's. This lets the
# Random Forest rank which variables best predict COVID-19 mortality per country.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    # Rows with no history for this lag get 0 instead of NaN.
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column AFTER 'location' and 'date' --
# that includes 'Mortality Rate' (the prediction target) and the three lag
# columns, so the fitted components mix the target into the model inputs
# (target leakage); confirm this is intended. PCA is also fitted on unscaled
# data, so the highest-variance raw columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components columns of the transformed matrix.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL input columns), not the named
# feature itself -- downstream "feature importances" therefore describe
# components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded location columns are never added to X below --
# the encoding's only effect here is that 'Mortality Rate' is read from the
# dummified frame. Confirm the dummies were meant to reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the first 7 principal components (labelled with raw-feature names above).
X = principal_df[selected_cols].values
# y: the per-day COVID-19 mortality rate target.
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the scaler fitted on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor to be tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space (key order kept stable so tie-breaking is unchanged).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9956602391751492
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ directly guarantees every tuned parameter is applied
# and cannot drift out of sync if the search grid gains a parameter later
# (the original copied each entry by hand). grid_search.best_estimator_ would
# also work, but an explicit refit keeps the training step visible.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality on the held-out, scaled test split.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalises its inputs to sum to 1); y_test
# and y_pred are raw mortality rates, not distributions, and y_test contains
# zeros, so this value is hard to interpret as a regression metric -- confirm
# it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.125947936501202 R2 Score: 0.9878566963057185 RMSE: 0.354891 Entropy Value: 0.002084045307312263
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): the inputs are principal components relabelled with raw feature
# names (see the PCA step above), so each "importance" belongs to a component,
# not to the named raw feature.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.902459 |
| 1 | diabetes_prevalence | 0.079682 |
| 2 | female_smokers | 0.009289 |
| 3 | male_smokers | 0.003096 |
| 6 | median_age | 0.002737 |
| 0 | cardiovasc_death_rate | 0.002483 |
| 4 | life_expectancy | 0.000254 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute, user-specific path. The file written earlier in this
# notebook went to the working directory as "dataframe-one.csv"; confirm the
# Downloads copy is the same file and not stale.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Czechia'
country2 = 'France'
# Country-level socioeconomic features used as Random Forest inputs for this pair.
country_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
# Select the feature columns and keep only the two paired countries in one step.
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 4153 | Czechia | 3/1/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4154 | Czechia | 3/2/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4155 | Czechia | 3/3/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4156 | Czechia | 3/4/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| 4157 | Czechia | 3/5/2020 | 0.900 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 0.901 | 0.02 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2105 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per 'location' group so one country's series never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the warm-up rows with 0 assumes zero mortality before the
# series begins — plausible for early 2020, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# included the target ('Mortality Rate') and its lagged copies among the PCA
# inputs — target leakage that inflated the downstream CV / R^2 scores.
# PCA is now fitted on the five socio-economic predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so the high-variance
# 'population' column dominates the components — consider scaling before PCA.
feature_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Number of principal components kept: one per input variable for the country health index
n_components = 5
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# Resulting dataframe of principal components.
# NOTE(review): the columns are labelled PC1..PC5 because they are linear
# combinations of the predictors, not the raw variables; labelling them with
# the original feature names (as before) made the importance table misleading.
selected_cols = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; random_state fixed for reproducibility)
# NOTE(review): this is a shuffled random split of a time series, so training
# rows can post-date test rows — confirm a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitted on the training partition only, so test-set statistics do not leak in)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# (3*3*3*3 = 81 hyperparameter combinations; with cv=10 this trains 810 forests)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9939471991048846
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV(refit=True, the default) already refits the best
# configuration, so grid_search.best_estimator_ would avoid this retraining;
# kept as an explicit refit, but with the parameters unpacked instead of
# copied out of best_params_ one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and computes their KL divergence; y_test / y_pred here are raw
# mortality rates (including zeros), not distributions, so this value is hard
# to interpret — confirm the metric choice.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.14908620050795524 R2 Score: 0.9856258144461353 RMSE: 0.386117 Entropy Value: 0.0025994670043627145
# Rank the model inputs by their Random Forest importance scores, highest first.
importance_scores = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.949247 |
| 2 | gdp_per_capita | 0.024344 |
| 0 | human_development_index | 0.017391 |
| 3 | population_density | 0.007825 |
| 4 | population | 0.001193 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a relative or configurable path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the population-health-index model
country1 = 'Romania'
country2 = 'Slovakia'
# Extracting important features for Random Forest Model Analysis for the population health index
# (keep the identifiers 'location'/'date', the health predictors, and the target 'Mortality Rate')
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries being compared
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.07 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.85 | 43.0 | 2.036403 |
2067 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per 'location' group so one country's series never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the warm-up rows with 0 assumes zero mortality before the
# series begins — plausible for early 2020, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# included the target ('Mortality Rate') and its lagged copies among the PCA
# inputs — target leakage that inflated the downstream CV / R^2 scores.
# PCA is now fitted on the seven health predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so high-variance columns such
# as 'cardiovasc_death_rate' dominate the components — consider scaling first.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Number of principal components kept: one per input variable for the population health index
n_components = 7
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# Resulting dataframe of principal components.
# NOTE(review): the columns are labelled PC1..PC7 because they are linear
# combinations of the predictors, not the raw variables; labelling them with
# the original feature names (as before) made the importance table misleading.
selected_cols = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; random_state fixed for reproducibility)
# NOTE(review): this is a shuffled random split of a time series, so training
# rows can post-date test rows — confirm a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitted on the training partition only, so test-set statistics do not leak in)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# (3*3*3*3 = 81 hyperparameter combinations; with cv=10 this trains 810 forests)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975069670951232
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV(refit=True, the default) already refits the best
# configuration, so grid_search.best_estimator_ would avoid this retraining;
# kept as an explicit refit, but with the parameters unpacked instead of
# copied out of best_params_ one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and computes their KL divergence; y_test / y_pred here are raw
# mortality rates (including zeros), not distributions, so this value is hard
# to interpret — confirm the metric choice.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001889438938278478 R2 Score: 0.9989346063878206 RMSE: 0.043468 Entropy Value: 0.00017683320239097805
# Rank the model inputs by their Random Forest importance scores, highest first
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.766856 |
| 1 | diabetes_prevalence | 0.188484 |
| 5 | aged_65_older | 0.019559 |
| 2 | female_smokers | 0.017919 |
| 6 | median_age | 0.003669 |
| 3 | male_smokers | 0.002809 |
| 4 | life_expectancy | 0.000703 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a relative or configurable path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the country-health-index model
country1 = 'Romania'
country2 = 'Slovakia'
# Extracting important features for the Random Forest Model Analysis for the country health index
# (keep the identifiers 'location'/'date', the socio-economic predictors, and the target 'Mortality Rate')
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict the rows to the two countries being compared
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 0.860 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 0.828 | 5.7 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2067 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per 'location' group so one country's series never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the warm-up rows with 0 assumes zero mortality before the
# series begins — plausible for early 2020, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the original code fitted PCA on df_updated.iloc[:, 2:], which
# included the target ('Mortality Rate') and its lagged copies among the PCA
# inputs — target leakage that inflated the downstream CV / R^2 scores.
# PCA is now fitted on the five socio-economic predictor columns only.
# NOTE(review): PCA here runs on unscaled data, so the high-variance
# 'population' column dominates the components — consider scaling before PCA.
feature_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Number of principal components kept: one per input variable for the country health index
n_components = 5
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# Resulting dataframe of principal components.
# NOTE(review): the columns are labelled PC1..PC5 because they are linear
# combinations of the predictors, not the raw variables; labelling them with
# the original feature names (as before) made the importance table misleading.
selected_cols = ['PC{}'.format(i + 1) for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# (70/30 split; random_state fixed for reproducibility)
# NOTE(review): this is a shuffled random split of a time series, so training
# rows can post-date test rows — confirm a chronological split is not required.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fitted on the training partition only, so test-set statistics do not leak in)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# (3*3*3*3 = 81 hyperparameter combinations; with cv=10 this trains 810 forests)
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# (GridSearchCV's default scoring for a regressor is R^2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9977554150263899
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV(refit=True, the default) already refits the best
# configuration, so grid_search.best_estimator_ would avoid this retraining;
# kept as an explicit refit, but with the parameters unpacked instead of
# copied out of best_params_ one by one.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
# Predict mortality rates for the held-out, scaled test set
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and computes their KL divergence; y_test / y_pred here are raw
# mortality rates (including zeros), not distributions, so this value is hard
# to interpret — confirm the metric choice.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0016264328679556177 R2 Score: 0.9990829070190872 RMSE: 0.040329 Entropy Value: 0.0001602501218235589
# Rank the model inputs by their Random Forest importance scores, highest first
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.623249 |
| 0 | human_development_index | 0.351020 |
| 2 | gdp_per_capita | 0.019462 |
| 3 | population_density | 0.005409 |
| 4 | population | 0.000861 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute local path — consider a relative or configurable path
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run of the population-health-index model
country1 = 'Belgium'
country2 = 'Estonia'
# Extracting important features for Random Forest Model Analysis for the population health index
# (keep the identifiers 'location'/'date', the health predictors, and the target 'Mortality Rate')
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict the rows to the two countries being compared
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.466423 |
2121 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per 'location' group so one country's series never bleeds into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the warm-up rows with 0 assumes zero mortality before the
# series begins — plausible for early 2020, but confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population health index Random Forest model.
n_components = 7  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixtures of ALL
# input features), yet they are named after the original features — the names
# do not describe the values they hold. Consider 'PC1'..'PC7' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
# NOTE(review): the dummy columns created here are never added to X below, so
# this step only changes df_updated — confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
# NOTE(review): a random shuffle split of a time series mixes future and past
# observations between train and test — confirm this is acceptable here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features; the scaler is fitted on the training set only and
# applied to both splits in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyper-parameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyper-parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) — 810 fits in
# total. n_jobs=-1 parallelises the fits across all CPU cores; the selected
# hyper-parameters and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984633650339623
# Reuse the model refit by GridSearchCV: with the default refit=True, the
# search has already retrained the best hyper-parameter combination on the
# whole training set (inheriting the base estimator's random_state=42), so
# rebuilding and refitting an identical forest here is redundant work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and Entropy of test targets vs predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric and is undefined when a prediction is 0 where the true
# value is not. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01970524664080637 R2 Score: 0.9984388662695985 RMSE: 0.140375 Entropy Value: 0.0008164149340349968
# Rank the model inputs by impurity-based importance (highest first).
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names, not the raw variables themselves —
# interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.725259 |
| 0 | cardiovasc_death_rate | 0.226135 |
| 2 | female_smokers | 0.037629 |
| 5 | aged_65_older | 0.005417 |
| 6 | median_age | 0.002877 |
| 3 | male_smokers | 0.002343 |
| 4 | life_expectancy | 0.000341 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configuration variable so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Belgium'
country2 = 'Estonia'
# Restrict the data to the two countries being compared and to the
# socio-economic indicators used for the country health index model.
socio_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 1039 | Belgium | 2/4/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1040 | Belgium | 2/5/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1041 | Belgium | 2/6/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1042 | Belgium | 2/7/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| 1043 | Belgium | 2/8/2020 | 0.931 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7305 | Estonia | 12/25/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7306 | Estonia | 12/26/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.464100 |
| 7307 | Estonia | 12/27/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.463645 |
| 7308 | Estonia | 12/28/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
| 7309 | Estonia | 12/29/2022 | 0.892 | 0.5 | 29481.252 | 31.033 | 1326064 | 0.466423 |
2121 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Supervised-learning reframing of the time series: lagged mortality rates for
# the previous day, week, and month, computed per country so values never leak
# across country boundaries. Leading NaNs (rows with no history yet) become 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# PCA to decorrelate the multi-collinear predictors (all columns after location/date).
# NOTE(review): PCA is fitted on unscaled features and on the full dataset
# (before the train/test split), so large-magnitude columns dominate the
# components and the later split leaks information — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country health index Random Forest model.
n_components = 5  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixtures of ALL
# input features), yet they are named after the original features — the names
# do not describe the values they hold. Consider 'PC1'..'PC5' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
# NOTE(review): the dummy columns created here are never added to X below, so
# this step only changes df_updated — confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
# NOTE(review): a random shuffle split of a time series mixes future and past
# observations between train and test — confirm this is acceptable here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features; the scaler is fitted on the training set only and
# applied to both splits in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyper-parameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyper-parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) — 810 fits in
# total. n_jobs=-1 parallelises the fits across all CPU cores; the selected
# hyper-parameters and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983140649374975
# Reuse the model refit by GridSearchCV: with the default refit=True, the
# search has already retrained the best hyper-parameter combination on the
# whole training set (inheriting the base estimator's random_state=42), so
# rebuilding and refitting an identical forest here is redundant work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and Entropy of test targets vs predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric and is undefined when a prediction is 0 where the true
# value is not. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.01836382495651645 R2 Score: 0.9985451394199026 RMSE: 0.135513 Entropy Value: 0.001203144259433017
# Rank the model inputs by impurity-based importance (highest first).
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names, not the raw variables themselves —
# interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.945679 |
| 2 | gdp_per_capita | 0.038606 |
| 0 | human_development_index | 0.011137 |
| 3 | population_density | 0.003912 |
| 4 | population | 0.000666 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configuration variable so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Restrict the data to the two countries being compared and to the
# population-health indicators used for the population health index model.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2079 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Supervised-learning reframing of the time series: lagged mortality rates for
# the previous day, week, and month, computed per country so values never leak
# across country boundaries. Leading NaNs (rows with no history yet) become 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# PCA to decorrelate the multi-collinear predictors (all columns after location/date).
# NOTE(review): PCA is fitted on unscaled features and on the full dataset
# (before the train/test split), so large-magnitude columns dominate the
# components and the later split leaks information — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population health index Random Forest model.
n_components = 7  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixtures of ALL
# input features), yet they are named after the original features — the names
# do not describe the values they hold. Consider 'PC1'..'PC7' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
# NOTE(review): the dummy columns created here are never added to X below, so
# this step only changes df_updated — confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
# NOTE(review): a random shuffle split of a time series mixes future and past
# observations between train and test — confirm this is acceptable here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features; the scaler is fitted on the training set only and
# applied to both splits in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyper-parameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyper-parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) — 810 fits in
# total. n_jobs=-1 parallelises the fits across all CPU cores; the selected
# hyper-parameters and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9983315321322627
# Reuse the model refit by GridSearchCV: with the default refit=True, the
# search has already retrained the best hyper-parameter combination on the
# whole training set (inheriting the base estimator's random_state=42), so
# rebuilding and refitting an identical forest here is redundant work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and Entropy of test targets vs predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric and is undefined when a prediction is 0 where the true
# value is not. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0028103373338490153 R2 Score: 0.9929235243700243 RMSE: 0.053013 Entropy Value: 0.0010319225336165856
# Rank the model inputs by impurity-based importance (highest first).
# NOTE(review): X was built from principal components, so these "features" are
# PCs carrying original-variable names, not the raw variables themselves —
# interpret the ranking accordingly.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.915515 |
| 6 | median_age | 0.050625 |
| 2 | female_smokers | 0.016696 |
| 0 | cardiovasc_death_rate | 0.009803 |
| 3 | male_smokers | 0.003679 |
| 5 | aged_65_older | 0.003224 |
| 4 | life_expectancy | 0.000458 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative path or
# a configuration variable so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Latvia'
country2 = 'Luxembourg'
# Restrict the data to the two countries being compared and to the
# socio-economic indicators used for the country health index model.
socio_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), socio_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 0.916 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 0.866 | 0.7 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2079 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Supervised-learning reframing of the time series: lagged mortality rates for
# the previous day, week, and month, computed per country so values never leak
# across country boundaries. Leading NaNs (rows with no history yet) become 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# PCA to decorrelate the multi-collinear predictors (all columns after location/date).
# NOTE(review): PCA is fitted on unscaled features and on the full dataset
# (before the train/test split), so large-magnitude columns dominate the
# components and the later split leaks information — confirm this is intended.
pca = PCA().fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country health index Random Forest model.
n_components = 5  # number of retained principal components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the component scores in a dataframe.
# NOTE(review): these columns are principal components (linear mixtures of ALL
# input features), yet they are named after the original features — the names
# do not describe the values they hold. Consider 'PC1'..'PC5' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column.
# NOTE(review): the dummy columns created here are never added to X below, so
# this step only changes df_updated — confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model.
# NOTE(review): a random shuffle split of a time series mixes future and past
# observations between train and test — confirm this is acceptable here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise the features; the scaler is fitted on the training set only and
# applied to both splits in the next cell.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training set to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor for the hyper-parameter search
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyper-parameter grid: 3*3*3*3 = 81 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10) — 810 fits in
# total. n_jobs=-1 parallelises the fits across all CPU cores; the selected
# hyper-parameters and scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9969485269472338
# Reuse the model refit by GridSearchCV: with the default refit=True, the
# search has already retrained the best hyper-parameter combination on the
# whole training set (inheriting the base estimator's random_state=42), so
# rebuilding and refitting an identical forest here is redundant work.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the Random Forest Model: Mean Squared Error (MSE), Root Mean
# Squared Error (RMSE), R^2 Score, and Entropy of test targets vs predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric and is undefined when a prediction is 0 where the true
# value is not. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003435249287771037
R2 Score: 0.991349985791738
RMSE: 0.058611
Entropy Value: 0.0013719375765382364
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.937685 |
| 2 | gdp_per_capita | 0.038266 |
| 0 | human_development_index | 0.019146 |
| 3 | population_density | 0.003910 |
| 4 | population | 0.000993 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovenia'
# Restrict the data to the chosen country pair and to the population-health
# features (plus identifiers and the mortality-rate target) used by the
# Random Forest analysis.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the NaNs come from the first 1/7/30 rows of each country's
# series; filling with 0 makes "no history yet" indistinguishable from a true
# zero mortality rate.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself and its lagged copies -- the prediction
# target leaks into the features, which likely explains the near-perfect R^2
# downstream. PCA is also fitted on the full dataset before the train/test
# split and on unstandardized columns (PCA is scale-sensitive). Consider
# dropping the target, scaling first, and fitting on training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a linear mixture of all PCA inputs), not the
# named variable -- the later "feature importance" table inherits these
# misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below -- X is
# built from principal_df and y from 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled random split of autocorrelated daily rows -- reported
# scores are likely optimistic; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder -- the grid search below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 3*3*3*3 = 81 candidates x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9949427400565657
# Fit the final random forest with the hyperparameters chosen by the grid search.
# Unpacking best_params_ keeps this in sync with param_grid instead of copying
# each key out by hand (the original repeated every parameter name).
# NOTE(review): GridSearchCV refits the winning model on the full training set
# by default (refit=True), so grid_search.best_estimator_ is an already-fitted
# equivalent of this model; re-fitting here duplicates that work.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; it is not a standard regression
# metric and is infinite wherever y_test is positive but y_pred is zero.
# Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010607100132758094
R2 Score: 0.9938721980926469
RMSE: 0.102991
Entropy Value: 0.0015120167964044018
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.794401 |
| 0 | cardiovasc_death_rate | 0.138390 |
| 5 | aged_65_older | 0.041077 |
| 2 | female_smokers | 0.022642 |
| 6 | median_age | 0.001811 |
| 3 | male_smokers | 0.001273 |
| 4 | life_expectancy | 0.000407 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovenia'
# Restrict the data to the chosen country pair and to the country-level
# socioeconomic features (plus identifiers and the mortality-rate target)
# used by the Random Forest analysis.
economy_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), economy_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 0.806 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 0.917 | 0.00 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the NaNs come from the first 1/7/30 rows of each country's
# series; filling with 0 makes "no history yet" indistinguishable from a true
# zero mortality rate.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself and its lagged copies -- the prediction
# target leaks into the features, which likely explains the near-perfect R^2
# downstream. PCA is also fitted on the full dataset before the train/test
# split and on unstandardized columns (PCA is scale-sensitive). Consider
# dropping the target, scaling first, and fitting on training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a linear mixture of all PCA inputs), not the
# named variable -- the later "feature importance" table inherits these
# misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below -- X is
# built from principal_df and y from 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled random split of autocorrelated daily rows -- reported
# scores are likely optimistic; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder -- the grid search below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 3*3*3*3 = 81 candidates x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9971862848708511
# Fit the final random forest with the hyperparameters chosen by the grid search.
# Unpacking best_params_ keeps this in sync with param_grid instead of copying
# each key out by hand (the original repeated every parameter name).
# NOTE(review): GridSearchCV refits the winning model on the full training set
# by default (refit=True), so grid_search.best_estimator_ is an already-fitted
# equivalent of this model; re-fitting here duplicates that work.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; it is not a standard regression
# metric and is infinite wherever y_test is positive but y_pred is zero.
# Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004953974082252353
R2 Score: 0.9971380517341916
RMSE: 0.070384
Entropy Value: 0.001138633276467043
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.943705 |
| 2 | gdp_per_capita | 0.028816 |
| 0 | human_development_index | 0.022792 |
| 3 | population_density | 0.004115 |
| 4 | population | 0.000572 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Restrict the data to the chosen country pair and to the population-health
# features (plus identifiers and the mortality-rate target) used by the
# Random Forest analysis.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2111 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the NaNs come from the first 1/7/30 rows of each country's
# series; filling with 0 makes "no history yet" indistinguishable from a true
# zero mortality rate.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' itself and its lagged copies -- the prediction
# target leaks into the features, which likely explains the near-perfect R^2
# downstream. PCA is also fitted on the full dataset before the train/test
# split and on unstandardized columns (PCA is scale-sensitive). Consider
# dropping the target, scaling first, and fitting on training rows only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but each
# column is a principal component (a linear mixture of all PCA inputs), not the
# named variable -- the later "feature importance" table inherits these
# misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below -- X is
# built from principal_df and y from 'Mortality Rate'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled random split of autocorrelated daily rows -- reported
# scores are likely optimistic; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (n_estimators=100 is a placeholder -- the grid search below overrides it)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# Uses the regressor's default scorer (R^2); 3*3*3*3 = 81 candidates x 10 folds.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9988028205689966
# Fit the final random forest with the hyperparameters chosen by the grid search.
# Unpacking best_params_ keeps this in sync with param_grid instead of copying
# each key out by hand (the original repeated every parameter name).
# NOTE(review): GridSearchCV refits the winning model on the full training set
# by default (refit=True), so grid_search.best_estimator_ is an already-fitted
# equivalent of this model; re-fitting here duplicates that work.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; it is not a standard regression
# metric and is infinite wherever y_test is positive but y_pred is zero.
# Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0061427712660212285
R2 Score: 0.9981286918115353
RMSE: 0.078376
Entropy Value: 0.0009086325364058084
# Rank the model inputs by their random-forest importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.853362 |
| 0 | cardiovasc_death_rate | 0.097828 |
| 2 | female_smokers | 0.022562 |
| 5 | aged_65_older | 0.021620 |
| 6 | median_age | 0.002997 |
| 3 | male_smokers | 0.001511 |
| 4 | life_expectancy | 0.000120 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- not portable; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Switzerland'
country2 = 'Canada'
# Extracting important features for the Random Forest Model Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned in the next cell write to a real copy instead of a
# chained-indexing view (avoids pandas' SettingWithCopyWarning and silently
# lost assignments under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 0.955 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 0.929 | 0.50 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2111 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country mortality series used to derive the lagged predictors.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
# Lag length in days for each new column: previous day, week, and month.
lag_columns = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for column_name, lag_days in lag_columns.items():
    # shift() leaves NaN in the first `lag_days` rows of each country;
    # those are replaced with 0, matching the original fillna(0) step.
    df_updated[column_name] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unstandardized columns from position 2
# onward, so variables with large numeric ranges (e.g. population, ~1e7)
# dominate the component directions. Standardizing before PCA is the usual
# practice — confirm this ordering is intentional, since StandardScaler is
# only applied later to the Random Forest inputs.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# pca was fit on 8 columns (5 predictors + 3 lagged mortality columns); only
# the first 5 components (largest explained variance) are retained here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, not the
# original variables — labeling them with the original feature names is
# misleading, and the downstream feature-importance table inherits that
# mislabeling. Confirm whether original-feature importances were intended.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so
# this call only removes 'location' from df_updated — verify the dummies
# were not meant to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn the standardization parameters from the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the parameters learned from the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Instantiate the RandomForestRegressor base estimator; its hyperparameters
# are overridden by each candidate drawn from the grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 fits the candidate models on all CPU cores — identical results,
# substantially faster for the 81-candidate x 10-fold search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9986226086040209
# GridSearchCV (refit=True by default) has already refit an estimator with the
# best hyperparameters on the full training set, preserving random_state=42
# from the base estimator — reuse it instead of rebuilding and retraining an
# identical RandomForestRegressor by hand.
best_rf_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes each array to sum to 1
# and returns the Kullback-Leibler divergence between those distributions —
# it is not a standard regression error metric, and its value depends on the
# normalization of y_test/y_pred. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009274554366257183 R2 Score: 0.9971746384850856 RMSE: 0.096304 Entropy Value: 0.0011833353007308447
# Rank the model inputs by their impurity-based importance scores.
# NOTE(review): the model was trained on PCA outputs, so each row describes a
# principal component that merely carries an original column's name.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.960005 |
| 2 | gdp_per_capita | 0.024331 |
| 0 | human_development_index | 0.012512 |
| 3 | population_density | 0.002821 |
| 4 | population | 0.000331 |
# Reload the dataframe holding the first country of each pairing from the
# previous step, so this analysis starts from the full, unfiltered data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Denmark'
# Extracting important features for Random Forest Model Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned in the next cell write to a real copy instead of a
# chained-indexing view (avoids pandas' SettingWithCopyWarning and silently
# lost assignments under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.229131 |
2089 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country mortality series used to derive the lagged predictors.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
# Lag length in days for each new column: previous day, week, and month.
lag_columns = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for column_name, lag_days in lag_columns.items():
    # shift() leaves NaN in the first `lag_days` rows of each country;
    # those are replaced with 0, matching the original fillna(0) step.
    df_updated[column_name] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unstandardized columns from position 2
# onward, so variables with large numeric ranges (e.g. cardiovasc_death_rate
# vs. smoker percentages) dominate the component directions. Standardizing
# before PCA is the usual practice — confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# pca was fit on 10 columns (7 predictors + 3 lagged mortality columns); only
# the first 7 components (largest explained variance) are retained here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC7, not the
# original variables — labeling them with the original feature names is
# misleading, and the downstream feature-importance table inherits that
# mislabeling. Confirm whether original-feature importances were intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so
# this call only removes 'location' from df_updated — verify the dummies
# were not meant to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn the standardization parameters from the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the parameters learned from the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Instantiate the RandomForestRegressor base estimator; its hyperparameters
# are overridden by each candidate drawn from the grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 fits the candidate models on all CPU cores — identical results,
# substantially faster for the 81-candidate x 10-fold search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9977923468629054
# GridSearchCV (refit=True by default) has already refit an estimator with the
# best hyperparameters on the full training set, preserving random_state=42
# from the base estimator — reuse it instead of rebuilding and retraining an
# identical RandomForestRegressor by hand.
best_rf_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes each array to sum to 1
# and returns the Kullback-Leibler divergence between those distributions —
# it is not a standard regression error metric, and its value depends on the
# normalization of y_test/y_pred. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001712667679856554 R2 Score: 0.9986011517667411 RMSE: 0.041384 Entropy Value: 0.00040951059147751267
# Rank the model inputs by their impurity-based importance scores.
# NOTE(review): the model was trained on PCA outputs, so each row describes a
# principal component that merely carries an original column's name.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.849240 |
| 1 | diabetes_prevalence | 0.129051 |
| 5 | aged_65_older | 0.013820 |
| 0 | cardiovasc_death_rate | 0.003271 |
| 2 | female_smokers | 0.002484 |
| 3 | male_smokers | 0.001336 |
| 4 | life_expectancy | 0.000799 |
# Reload the dataframe holding the first country of each pairing from the
# previous step, so this analysis starts from the full, unfiltered data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Denmark'
# Extracting important features for the Random Forest Model Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned in the next cell write to a real copy instead of a
# chained-indexing view (avoids pandas' SettingWithCopyWarning and silently
# lost assignments under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 0.887 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6244 | Denmark | 12/25/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6245 | Denmark | 12/26/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.227772 |
| 6246 | Denmark | 12/27/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.228905 |
| 6247 | Denmark | 12/28/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.229131 |
| 6248 | Denmark | 12/29/2022 | 0.940 | 0.20 | 46682.515 | 136.520 | 5882259 | 0.229131 |
2089 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country mortality series used to derive the lagged predictors.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
# Lag length in days for each new column: previous day, week, and month.
lag_columns = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for column_name, lag_days in lag_columns.items():
    # shift() leaves NaN in the first `lag_days` rows of each country;
    # those are replaced with 0, matching the original fillna(0) step.
    df_updated[column_name] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the raw, unstandardized columns from position 2
# onward, so variables with large numeric ranges (e.g. population, ~1e6-1e7)
# dominate the component directions. Standardizing before PCA is the usual
# practice — confirm this ordering is intentional, since StandardScaler is
# only applied later to the Random Forest inputs.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# pca was fit on 8 columns (5 predictors + 3 lagged mortality columns); only
# the first 5 components (largest explained variance) are retained here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, not the
# original variables — labeling them with the original feature names is
# misleading, and the downstream feature-importance table inherits that
# mislabeling. Confirm whether original-feature importances were intended.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never added to X below, so
# this call only removes 'location' from df_updated — verify the dummies
# were not meant to be model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as the test set for the Random Forest Model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn the standardization parameters from the training split only, so no
# information from the held-out test set leaks into the scaler.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the parameters learned from the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Instantiate the RandomForestRegressor base estimator; its hyperparameters
# are overridden by each candidate drawn from the grid below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid explored by the search (3*3*3*3 = 81 candidates).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 fits the candidate models on all CPU cores — identical results,
# substantially faster for the 81-candidate x 10-fold search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and the corresponding mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9963693732409921
# GridSearchCV (refit=True by default) has already refit an estimator with the
# best hyperparameters on the full training set, preserving random_state=42
# from the base estimator — reuse it instead of rebuilding and retraining an
# identical RandomForestRegressor by hand.
best_rf_model = grid_search.best_estimator_
# Predict mortality rates for the held-out test set.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes each array to sum to 1
# and returns the Kullback-Leibler divergence between those distributions —
# it is not a standard regression error metric, and its value depends on the
# normalization of y_test/y_pred. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0017673867368454051 R2 Score: 0.9985564591173178 RMSE: 0.042040 Entropy Value: 0.0008057123359739224
# Rank the model inputs by their impurity-based importance scores.
# NOTE(review): the model was trained on PCA outputs, so each row describes a
# principal component that merely carries an original column's name.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.930685 |
| 2 | gdp_per_capita | 0.045621 |
| 0 | human_development_index | 0.018531 |
| 3 | population_density | 0.004051 |
| 4 | population | 0.001112 |
# Reload the dataframe holding the first country of each pairing from the
# previous step, so this analysis starts from the full, unfiltered data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'Iceland'
# Extracting important features for Random Forest Model Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lagged
# mortality columns assigned in the next cell write to a real copy instead of a
# chained-indexing view (avoids pandas' SettingWithCopyWarning and silently
# lost assignments under copy-on-write).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country mortality series used to derive the lagged predictors.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
# Lag length in days for each new column: previous day, week, and month.
lag_columns = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for column_name, lag_days in lag_columns.items():
    # shift() leaves NaN in the first `lag_days` rows of each country;
    # those are replaced with 0, matching the original fillna(0) step.
    df_updated[column_name] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its three
# lag features -- fitting PCA on the target leaks it into the downstream model
# inputs, which likely inflates the near-perfect R^2 reported later. Presumably
# unintended; TODO confirm.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the transform), so the highest-variance column dominates PC1.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fitted (previous cell) on df_updated.iloc[:,2:], which
# includes 'Mortality Rate' itself plus its three lag columns -- this transform
# therefore mixes the target into the model inputs (target leakage). TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- principal_components holds
# PCA component scores (PC1..PC7, ordered by explained variance), NOT the original
# features; e.g. the column named 'diabetes_prevalence' is really PC2. The
# feature-importance table printed below inherits this mislabeling. TODO confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never fed to the
# model (X is built from principal_df); only 'Mortality Rate' is read back out.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle of daily time-series rows lets test days fall
# between train days of the same country -- temporal leakage that inflates test
# scores; a chronological split would be more honest. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied only AFTER PCA; the PCA itself was fitted on
# raw (unscaled) columns, so high-variance inputs dominate the components.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV refits on the full training set by default, so
# grid_search.best_estimator_ already holds this model; rebuilding it by hand is
# redundant but harmless given the fixed random_state.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) rescales both vectors to sum to 1
# and returns their KL divergence; mortality rates are not probability
# distributions, and a zero prediction where y_test is nonzero yields inf --
# treat this "entropy" metric with caution. TODO confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003560841308279542 R2 Score: 0.9970021670932279 RMSE: 0.059673 Entropy Value: 0.0011802071599462333
# Rank the model inputs by Random Forest importance (labels are PCA components
# named after raw features -- see NOTE above on mislabeling)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.517316 |
| 0 | cardiovasc_death_rate | 0.432531 |
| 6 | median_age | 0.019175 |
| 5 | aged_65_older | 0.014394 |
| 2 | female_smokers | 0.011943 |
| 3 | male_smokers | 0.003277 |
| 4 | life_expectancy | 0.001363 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Finland'
country2 = 'Iceland'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent DataFrame: the lag-feature
# column assignments that follow would otherwise write into a view of the full
# frame, raising pandas' SettingWithCopyWarning and risking silently lost writes.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 0.938 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 0.949 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days, and 30 days, computed within each
# location so one country's history never bleeds into another's.
# Rows with no history at a given horizon (the first 1/7/30 days per
# country) are filled with 0, exactly as the original fillna(0) step did.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its three
# lag features -- fitting PCA on the target leaks it into the downstream model
# inputs, which likely inflates the near-perfect R^2 reported later. Presumably
# unintended; TODO confirm.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the transform), so the highest-variance column (e.g. population)
# dominates PC1.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# NOTE(review): pca was fitted (previous cell) on df_updated.iloc[:,2:], which
# includes 'Mortality Rate' itself plus its three lag columns -- this transform
# therefore mixes the target into the model inputs (target leakage). TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- principal_components holds
# PCA component scores (PC1..PC5, ordered by explained variance), NOT the original
# features; e.g. the column named 'extreme_poverty' is really PC2. The
# feature-importance table printed below inherits this mislabeling. TODO confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never fed to the
# model (X is built from principal_df); only 'Mortality Rate' is read back out.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle of daily time-series rows lets test days fall
# between train days of the same country -- temporal leakage that inflates test
# scores; a chronological split would be more honest. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied only AFTER PCA; the PCA itself was fitted on
# raw (unscaled) columns, so high-variance inputs dominate the components.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9954917175836794
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV refits on the full training set by default, so
# grid_search.best_estimator_ already holds this model; rebuilding it by hand is
# redundant but harmless given the fixed random_state.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) rescales both vectors to sum to 1
# and returns their KL divergence; mortality rates are not probability
# distributions, and a zero prediction where y_test is nonzero yields inf --
# treat this "entropy" metric with caution. TODO confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002995537974182363 R2 Score: 0.9974780897167169 RMSE: 0.054732 Entropy Value: 0.0010015390611765433
# Rank the model inputs by Random Forest importance (labels are PCA components
# named after raw features -- see NOTE above on mislabeling)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.940726 |
| 0 | human_development_index | 0.036143 |
| 2 | gdp_per_capita | 0.013851 |
| 3 | population_density | 0.006981 |
| 4 | population | 0.002299 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Ireland'
country2 = 'Italy'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent DataFrame: the lag-feature
# column assignments that follow would otherwise write into a view of the full
# frame, raising pandas' SettingWithCopyWarning and risking silently lost writes.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days, and 30 days, computed within each
# location so one country's history never bleeds into another's.
# Rows with no history at a given horizon (the first 1/7/30 days per
# country) are filled with 0, exactly as the original fillna(0) step did.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its three
# lag features -- fitting PCA on the target leaks it into the downstream model
# inputs, which likely inflates the near-perfect R^2 reported later. Presumably
# unintended; TODO confirm.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the transform), so the highest-variance column dominates PC1.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# NOTE(review): pca was fitted (previous cell) on df_updated.iloc[:,2:], which
# includes 'Mortality Rate' itself plus its three lag columns -- this transform
# therefore mixes the target into the model inputs (target leakage). TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading -- principal_components holds
# PCA component scores (PC1..PC7, ordered by explained variance), NOT the original
# features; e.g. the column named 'diabetes_prevalence' is really PC2. The
# feature-importance table printed below inherits this mislabeling. TODO confirm intent.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never fed to the
# model (X is built from principal_df); only 'Mortality Rate' is read back out.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random shuffle of daily time-series rows lets test days fall
# between train days of the same country -- temporal leakage that inflates test
# scores; a chronological split would be more honest. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling is applied only AFTER PCA; the PCA itself was fitted on
# raw (unscaled) columns, so high-variance inputs dominate the components.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9989134316621475
# fit random forest model with best hyperparameters from above
# NOTE(review): GridSearchCV refits on the full training set by default, so
# grid_search.best_estimator_ already holds this model; rebuilding it by hand is
# redundant but harmless given the fixed random_state.
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) rescales both vectors to sum to 1
# and returns their KL divergence; mortality rates are not probability
# distributions, and a zero prediction where y_test is nonzero yields inf --
# treat this "entropy" metric with caution. TODO confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007507485729557742 R2 Score: 0.9993832430454774 RMSE: 0.086646 Entropy Value: 0.0003063610193860753
# Rank the model inputs by Random Forest importance (labels are PCA components
# named after raw features -- see NOTE above on mislabeling)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.839694 |
| 1 | diabetes_prevalence | 0.112669 |
| 6 | median_age | 0.028748 |
| 2 | female_smokers | 0.013838 |
| 0 | cardiovasc_death_rate | 0.003462 |
| 3 | male_smokers | 0.001347 |
| 4 | life_expectancy | 0.000242 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- this only runs on the
# author's machine; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for this run of the analysis.
country1 = 'Ireland'
country2 = 'Italy'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent DataFrame: the lag-feature
# column assignments that follow would otherwise write into a view of the full
# frame, raising pandas' SettingWithCopyWarning and risking silently lost writes.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 0.955 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 0.892 | 2.0 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Convert the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days, and 30 days, computed within each
# location so one country's history never bleeds into another's.
# Rows with no history at a given horizon (the first 1/7/30 days per
# country) are filled with 0, exactly as the original fillna(0) step did.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its three
# lag features -- fitting PCA on the target leaks it into the downstream model
# inputs, which likely inflates the near-perfect R^2 reported later. Presumably
# unintended; TODO confirm.
# NOTE(review): PCA is fitted on unscaled data here (StandardScaler is applied
# only after the transform), so the highest-variance column (e.g. population)
# dominates PC1.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985178533250438
# Refit a Random Forest on the full training set using the best
# hyperparameters found above. ** unpacking keeps the parameter list in
# sync with param_grid automatically instead of copying each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applying it to raw regression
# targets/predictions is not a standard error metric — interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00967545011446578 R2 Score: 0.9992051398615733 RMSE: 0.098364 Entropy Value: 0.0004210493876806918
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): the 'feature' labels are the names given to the principal
# components upstream, not the raw variables themselves.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.943896 |
| 2 | gdp_per_capita | 0.028862 |
| 0 | human_development_index | 0.023004 |
| 3 | population_density | 0.003151 |
| 4 | population | 0.001088 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing under comparison for this run of the analysis.
country1 = 'Netherlands'
country2 = 'Portugal'
# Restrict to the population-health-index predictors plus identifiers and the
# target, for only the two countries in the current pairing.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.462977 |
2071 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day/week/month).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Lags are undefined at the start of each country's series; treat them as 0.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# PCA to decorrelate the numeric predictors (multi-collinearity).
# FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates the
# model's apparent accuracy. The target is excluded from the PCA input here.
feature_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_frame)
# Keep the first 7 principal components (the number of population-health
# input variables for the Random Forest analysis).
n_components = 7
principal_components = pca.transform(feature_frame)[:, :n_components]
# NOTE(review): the PC columns are *named after* the original variables for
# downstream compatibility, but each column is a principal component (a
# linear mix of all inputs), not the variable it is named for.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column ('Mortality Rate' is kept intact).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split. NOTE(review): a random split of a time series with
# lagged features lets near-adjacent days straddle the split — consider a
# chronological split for a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize train and test features with the scaler fitted on the training
# set only, so no test-set statistics leak into training.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixed for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidate * 10 fold fits in parallel across all
# cores; results are unchanged because each fit is seeded by random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9987759689263965
# Refit a Random Forest on the full training set using the best
# hyperparameters found above. ** unpacking keeps the parameter list in
# sync with param_grid automatically instead of copying each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applying it to raw regression
# targets/predictions is not a standard error metric — interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006167962396446283 R2 Score: 0.999198720666993 RMSE: 0.078536 Entropy Value: 0.00030928623928154725
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): the 'feature' labels are the names given to the principal
# components upstream, not the raw variables themselves.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.490064 |
| 1 | diabetes_prevalence | 0.465516 |
| 2 | female_smokers | 0.035163 |
| 3 | male_smokers | 0.003769 |
| 6 | median_age | 0.003047 |
| 5 | aged_65_older | 0.001992 |
| 4 | life_expectancy | 0.000448 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing under comparison for this run of the analysis.
country1 = 'Netherlands'
country2 = 'Portugal'
# Restrict to the country-health-index predictors plus identifiers and the
# target, for only the two countries in the current pairing.
index_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, index_cols]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 0.944 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11513 | Portugal | 12/25/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11514 | Portugal | 12/26/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11515 | Portugal | 12/27/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11516 | Portugal | 12/28/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
| 11517 | Portugal | 12/29/2022 | 0.864 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.462977 |
2071 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day/week/month).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Lags are undefined at the start of each country's series; treat them as 0.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# PCA to decorrelate the numeric predictors (multi-collinearity).
# FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates the
# model's apparent accuracy. The target is excluded from the PCA input here.
feature_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_frame)
# Keep the first 5 principal components (the number of country-health-index
# input variables for the Random Forest analysis).
n_components = 5
principal_components = pca.transform(feature_frame)[:, :n_components]
# NOTE(review): the PC columns are *named after* the original variables for
# downstream compatibility, but each column is a principal component (a
# linear mix of all inputs), not the variable it is named for.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column ('Mortality Rate' is kept intact).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split. NOTE(review): a random split of a time series with
# lagged features lets near-adjacent days straddle the split — consider a
# chronological split for a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize train and test features with the scaler fitted on the training
# set only, so no test-set statistics leak into training.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixed for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidate * 10 fold fits in parallel across all
# cores; results are unchanged because each fit is seeded by random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985701391894292
# Refit a Random Forest on the full training set using the best
# hyperparameters found above. ** unpacking keeps the parameter list in
# sync with param_grid automatically instead of copying each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applying it to raw regression
# targets/predictions is not a standard error metric — interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006396441720377513 R2 Score: 0.9991690389425403 RMSE: 0.079978 Entropy Value: 0.00045399991497102854
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): the 'feature' labels are the names given to the principal
# components upstream, not the raw variables themselves.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.954092 |
| 2 | gdp_per_capita | 0.038297 |
| 0 | human_development_index | 0.003655 |
| 3 | population_density | 0.003477 |
| 4 | population | 0.000478 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing under comparison for this run of the analysis.
country1 = 'Spain'
country2 = 'Sweden'
# Restrict to the population-health-index predictors plus identifiers and the
# target, for only the two countries in the current pairing.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2126 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day/week/month).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Lags are undefined at the start of each country's series; treat them as 0.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# PCA to decorrelate the numeric predictors (multi-collinearity).
# FIX: the original fitted PCA on df_updated.iloc[:, 2:], which includes the
# target column 'Mortality Rate' itself — target leakage that inflates the
# model's apparent accuracy. The target is excluded from the PCA input here.
feature_frame = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_frame)
# Keep the first 7 principal components (the number of population-health
# input variables for the Random Forest analysis).
n_components = 7
principal_components = pca.transform(feature_frame)[:, :n_components]
# NOTE(review): the PC columns are *named after* the original variables for
# downstream compatibility, but each column is a principal component (a
# linear mix of all inputs), not the variable it is named for.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the country column ('Mortality Rate' is kept intact).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split. NOTE(review): a random split of a time series with
# lagged features lets near-adjacent days straddle the split — consider a
# chronological split for a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
[cell output] StandardScaler()
# Standardize train and test features with the scaler fitted on the training
# set only, so no test-set statistics leak into training.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor; random_state fixed for reproducibility.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the Random Forest.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 runs the 81 candidate * 10 fold fits in parallel across all
# cores; results are unchanged because each fit is seeded by random_state.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984616500795094
# Refit a Random Forest on the full training set using the best
# hyperparameters found above. ** unpacking keeps the parameter list in
# sync with param_grid automatically instead of copying each key by hand.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns their KL divergence; applying it to raw regression
# targets/predictions is not a standard error metric — interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.015222412063753327 R2 Score: 0.9982144571626685 RMSE: 0.123379 Entropy Value: 0.0005650826297719659
# Rank the model inputs by Random Forest impurity-based importance.
# NOTE(review): the 'feature' labels are the names given to the principal
# components upstream, not the raw variables themselves.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.972202 |
| 2 | female_smokers | 0.021973 |
| 5 | aged_65_older | 0.002099 |
| 3 | male_smokers | 0.001720 |
| 0 | cardiovasc_death_rate | 0.001019 |
| 6 | median_age | 0.000697 |
| 4 | life_expectancy | 0.000290 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): absolute local Windows path — not portable across machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for the country health index model.
country1 = 'Spain'
country2 = 'Sweden'
# Keep the identifier columns plus the country-health-index features
# needed for the Random Forest analysis.
health_index_cols = ['location', 'date', 'human_development_index',
                     'extreme_poverty', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[health_index_cols]
# Restrict the rows to the two countries of this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 23011 | Sweden | 2/1/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23012 | Sweden | 2/2/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23013 | Sweden | 2/3/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23014 | Sweden | 2/4/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| 23015 | Sweden | 2/5/2020 | 0.945 | 0.5 | 46949.283 | 24.718 | 10549349 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 0.904 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2126 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days (week), and 30 days (month).
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# PCA over all numeric columns (position 2 onward: the features plus the
# lagged mortality columns) to address multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before
# the train/test split — consider standardizing first and fitting on the
# training split only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country health index model.
n_components = 5 # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the
# feature-importance table printed later inherits these labels. Consider
# renaming them to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the resulting dummy columns are never used downstream — X is
# built from principal_df and y from 'Mortality Rate' — so this encoding is
# effectively dead code; confirm whether it was meant to feed the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the 5 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only (applied to both splits later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)
# Base estimator whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9984002473559823
# Refit a Random Forest using the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out test-set evaluation: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays to probability
# distributions and computes their KL divergence — it is not a conventional
# regression error metric here; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021481691633920292
R2 Score: 0.9974802626239476
RMSE: 0.146566
Entropy Value: 0.0008034528795503455
# Rank the model inputs by their Random Forest importance scores
# (highest first).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.973986 |
| 2 | gdp_per_capita | 0.022250 |
| 3 | population_density | 0.002222 |
| 0 | human_development_index | 0.001199 |
| 4 | population | 0.000344 |
# Reload the dataframe of first-of-pair countries for the next analysis run.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for the population health index model.
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the identifier columns plus the population-health features
# needed for the Random Forest analysis.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
# Restrict the rows to the two countries of this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days (week), and 30 days (month).
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# PCA over all numeric columns (position 2 onward) to address
# multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before
# the train/test split — consider standardizing first and fitting on the
# training split only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population health index model.
n_components = 7 # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the
# feature-importance table printed later inherits these labels. Consider
# renaming them to PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the resulting dummy columns are never used downstream — X is
# built from principal_df and y from 'Mortality Rate' — so this encoding is
# effectively dead code; confirm whether it was meant to feed the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the 7 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only (applied to both splits later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)
# Base estimator whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
# Refit a Random Forest using the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out test-set evaluation: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays to probability
# distributions and computes their KL divergence — it is not a conventional
# regression error metric here; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.483882915680185
R2 Score: 0.980047429301368
RMSE: 0.695617
Entropy Value: 0.005987538685788008
# Rank the model inputs by their Random Forest importance scores
# (highest first).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.770054 |
| 1 | diabetes_prevalence | 0.089675 |
| 5 | aged_65_older | 0.045220 |
| 6 | median_age | 0.031215 |
| 2 | female_smokers | 0.026417 |
| 3 | male_smokers | 0.023605 |
| 4 | life_expectancy | 0.013812 |
# Reload the dataframe of first-of-pair countries for the next analysis run.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under analysis for the country health index model.
country1 = 'United Kingdom'
country2 = 'United States'
# Keep the identifier columns plus the country-health-index features
# needed for the Random Forest analysis.
health_index_cols = ['location', 'date', 'human_development_index',
                     'extreme_poverty', 'gdp_per_capita',
                     'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[health_index_cols]
# Restrict the rows to the two countries of this pairing.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 0.932 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the per-country mortality time series into supervised-learning
# features: lags of 1 day, 7 days (week), and 30 days (month).
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # Rows with no history yet get 0 instead of NaN.
    df_updated[lag_col] = grouped_mortality.shift(lag).fillna(0)
# PCA over all numeric columns (position 2 onward) to address
# multi-collinearity.
# NOTE(review): PCA is fit on unscaled data and on the full dataset before
# the train/test split — consider standardizing first and fitting on the
# training split only to avoid leakage.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country health index model.
n_components = 5 # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the components in a dataframe.
# NOTE(review): these columns are principal components, NOT the original
# features — reusing the raw feature names here is misleading, and the
# feature-importance table printed later inherits these labels. Consider
# renaming them to PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies().
# NOTE(review): the resulting dummy columns are never used downstream — X is
# built from principal_df and y from 'Mortality Rate' — so this encoding is
# effectively dead code; confirm whether it was meant to feed the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the 5 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Target: the current-day mortality rate.
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training split only (applied to both splits later).
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with the statistics learned on the training set.
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)
# Base estimator whose hyperparameters are tuned below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9563158884930116
# Refit a Random Forest using the hyperparameters selected by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Held-out test-set evaluation: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arrays to probability
# distributions and computes their KL divergence — it is not a conventional
# regression error metric here; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  2.3376129179018905
R2 Score: 0.9036101802748917
RMSE: 1.528925
Entropy Value: 0.011597152974395022
# Rank the model inputs by their Random Forest importance scores
# (highest first).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.854034 |
| 2 | gdp_per_capita | 0.085807 |
| 3 | population_density | 0.033015 |
| 4 | population | 0.019051 |
| 0 | human_development_index | 0.008094 |
# Country pair-by-pair analysis relative to human development index.
# Load the cleaned and preprocessed Our World in Data COVID-19 dataset.
covid_csv_path = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(covid_csv_path)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Build one dataframe per country of interest (the countries forming the
# 13 pairs matched on human development index); each frame holds that
# country's daily rows.
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]
df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]
df_Iceland = df.loc[df["location"] == "Iceland"]
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Sweden = df.loc[df["location"] == "Sweden"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Czechia = df.loc[df["location"] == "Czechia"]
df_Estonia = df.loc[df["location"] == "Estonia"]
df_France = df.loc[df["location"] == "France"]
df_Italy = df.loc[df["location"] == "Italy"]
df_Latvia = df.loc[df["location"] == "Latvia"]
df_Portugal = df.loc[df["location"] == "Portugal"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Spain = df.loc[df["location"] == "Spain"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
# Drop the first two UK rows — presumably to align its date range with the
# other countries; TODO confirm against the raw data.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Stack the first country of each defined pairing into a single dataframe.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia,
              df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands,
              df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland,
              df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland,
              df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
              df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Persist the combined dataframe for the downstream per-pair model runs.
# NOTE(review): this writes to the current working directory, while later
# cells read from C:/Users/marco/Downloads — confirm both refer to the
# same file.
dataframe_one.to_csv("dataframe-one.csv")
# Reload the dataframe of first-of-pair countries for the next analysis run.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair analysed in this run.
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the population-health predictors plus the target ('Mortality Rate'),
# then restrict the frame to the two countries of interest.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[health_cols]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 7 days, and 30 days; the leading
# rows of each country (no history yet) come back NaN and are filled with 0.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, lag_days in lag_spec.items():
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    df_updated[lag_col] = shifted.fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its lag
# columns among the PCA inputs, so the components encode the target itself (data leakage) —
# confirm this is intended before trusting the downstream R^2 scores.
# NOTE(review): PCA is fit on unscaled columns; high-variance variables (e.g. population)
# will dominate the components — consider standardising before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables
# for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Fix: exclude the target from the PCA inputs. The original code transformed
# df_updated.iloc[:, 2:], which still contains the current-day 'Mortality Rate',
# so the principal components leaked the very value the model later predicts.
# The lagged-mortality columns are kept: they are legitimate supervised-learning predictors.
predictor_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
principal_components = pca.fit_transform(predictor_matrix)[:, :n_components]
# Resulting dataframe of the first 7 principal components.
# NOTE(review): these columns are principal components, not the original variables;
# the original feature names are kept only so downstream code continues to work —
# importances computed on them describe PCs, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used in X below, so this encoding
# has no effect on the model — confirm whether 'location' was meant to be a feature.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows mixes past and future across
# train/test; with lag-derived features this leaks temporal information — consider a
# chronological split (shuffle=False) instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (fit on the training split only, so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# NOTE(review): random forests are insensitive to monotonic feature scaling, so this
# step is likely unnecessary — harmless, though.
X_test_scaled = scaler.transform(X_test)
# Base estimator to be tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
# Refit a forest with the hyperparameters selected by the grid search.
# Unpacking best_params_ passes exactly the tuned n_estimators / max_depth /
# min_samples_split / min_samples_leaf, so the estimator is identical to listing them.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model: MSE, RMSE, R^2, and entropy on the held-out split.
# NOTE(review): scipy.stats.entropy(pk, qk) is the relative entropy (KL divergence)
# of the normalised inputs, not an error metric — interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.020452761128882797 R2 Score: 0.9982725374621599 RMSE: 0.143013 Entropy Value: 0.0008156180119545589
# Rank the model inputs by impurity-based importance, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.853422 |
| 1 | diabetes_prevalence | 0.089967 |
| 0 | cardiovasc_death_rate | 0.036597 |
| 5 | aged_65_older | 0.011508 |
| 3 | male_smokers | 0.004570 |
| 2 | female_smokers | 0.003805 |
| 4 | life_expectancy | 0.000130 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-read from disk to reset df_updated, which previous cells overwrote)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same country pair as above, now with the country-level (infrastructure/economic) predictors.
country1 = 'Austria'
country2 = 'Belgium'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.2 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) among the PCA inputs —
# data leakage; confirm this is intended. PCA is also fit on unscaled columns, so
# high-variance variables (e.g. population) will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables
# for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Fix: exclude the target from the PCA inputs. The original code transformed
# df_updated.iloc[:, 2:], which still contains the current-day 'Mortality Rate',
# so the principal components leaked the value the model later predicts.
# The lagged-mortality columns are kept as legitimate supervised-learning predictors.
predictor_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
principal_components = pca.fit_transform(predictor_matrix)[:, :n_components]
# Resulting dataframe of the first 5 principal components.
# NOTE(review): these columns are principal components, not the original variables;
# the feature names are kept only so downstream code continues to work.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used in X below — encoding has no model effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random splitting of time-series rows mixes past/future across train
# and test — consider a chronological split (shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set (training statistics only, so no test-set leakage here)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# NOTE(review): random forests are insensitive to monotonic feature scaling — step is
# harmless but unnecessary.
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the relative entropy (KL divergence) of the
# normalised inputs, not an error metric — interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010069518672884694 R2 Score: 0.9991495174577224 RMSE: 0.100347 Entropy Value: 0.00034019584039697294
# Rank PCA-component inputs by impurity-based importance (largest first)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.935823 |
| 2 | gdp_per_capita | 0.050241 |
| 0 | hospital_beds_per_thousand | 0.009467 |
| 3 | population_density | 0.003676 |
| 4 | population | 0.000793 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-read from disk to reset df_updated before the next country-pair analysis)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Next country pair: population-health predictors plus the target.
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) among the PCA inputs —
# data leakage; confirm this is intended. PCA is also fit on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables
# for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Fix: exclude the target from the PCA inputs. The original code transformed
# df_updated.iloc[:, 2:], which still contains the current-day 'Mortality Rate',
# so the principal components leaked the value the model later predicts.
# The lagged-mortality columns are kept as legitimate supervised-learning predictors.
predictor_matrix = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
principal_components = pca.fit_transform(predictor_matrix)[:, :n_components]
# Resulting dataframe of the first 7 principal components.
# NOTE(review): these columns are principal components, not the original variables;
# the feature names are kept only so downstream code continues to work.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used in X below — encoding has no model effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): random splitting of time-series rows mixes past/future across train
# and test — consider a chronological split (shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# (training statistics only, so no test-set leakage into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# NOTE(review): random forests are insensitive to monotonic feature scaling — step is
# harmless but unnecessary.
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the relative entropy (KL divergence) of the
# normalised inputs, not an error metric — interpret with care.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002748350813295417 R2 Score: 0.9993440760119627 RMSE: 0.052425 Entropy Value: 0.00028602597187348704
# Rank PCA-component inputs by impurity-based importance (largest first)
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.742827 |
| 0 | cardiovasc_death_rate | 0.187586 |
| 6 | median_age | 0.034101 |
| 2 | female_smokers | 0.019551 |
| 5 | aged_65_older | 0.014214 |
| 3 | male_smokers | 0.001588 |
| 4 | life_expectancy | 0.000133 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# (re-read from disk to reset df_updated before the next country-pair analysis)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Same country pair, now with the country-level (infrastructure/economic) predictors.
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.2 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.5 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month).
# The shift is applied within each location group so values never leak across
# countries; leading NaNs (rows without enough history) are filled with 0.
# A single loop replaces three near-identical shift/fillna statement pairs.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_location.shift(lag_days).fillna(0)
# Run Principal Component Analysis (PCA) on the numeric feature columns to
# address multi-collinearity among the predictors. fit() returns the fitted
# estimator itself, so fitting is chained directly onto construction.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# NOTE(review): df_updated.iloc[:,2:] also contains 'Mortality Rate' and the
# lagged mortality columns, so the target feeds into the PCA inputs (target
# leakage) — confirm this is intended.
# NOTE(review): PCA was fit on all rows before the train/test split below, so
# test rows influence the learned components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the raw feature names makes
# the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not included in X
# below; only the PCA components in selected_cols become model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; PCA is variance-sensitive, so
# unscaled features with large ranges dominate the components — verify ordering.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only,
# so no test-set information leaks into the scaling step.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; every hyperparameter listed in the grid below
# overrides this instance's value during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 spreads the 81 x 10 candidate fits over all CPU cores; results are
# unchanged because every fit is still seeded by random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default CV score is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
# Refit a RandomForestRegressor on the full training set with the best
# hyperparameters from the grid search. Unpacking best_params_ avoids
# hand-copying each key (the grid contains exactly these four parameters).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004149548610657574 R2 Score: 0.9990096648287803 RMSE: 0.064417 Entropy Value: 0.0005668080870825645
# Rank the model inputs by their random-forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.937978 |
| 0 | hospital_beds_per_thousand | 0.035832 |
| 2 | gdp_per_capita | 0.021979 |
| 3 | population_density | 0.003812 |
| 4 | population | 0.000400 |
# Reload the dataframe containing the first countries of each country pairing
# (produced in the previous step) so this section starts from fresh data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the columns used by the Random Forest Model Analysis for the
# population health index, then restrict the rows to the two selected countries.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[feature_cols]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated[pair_mask]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month).
# The shift is applied within each location group so values never leak across
# countries; leading NaNs (rows without enough history) are filled with 0.
# A single loop replaces three near-identical shift/fillna statement pairs.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_location.shift(lag_days).fillna(0)
# Run Principal Component Analysis (PCA) on the numeric feature columns to
# address multi-collinearity among the predictors. fit() returns the fitted
# estimator itself, so fitting is chained directly onto construction.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# NOTE(review): df_updated.iloc[:,2:] also contains 'Mortality Rate' and the
# lagged mortality columns, so the target feeds into the PCA inputs (target
# leakage) — confirm this is intended.
# NOTE(review): PCA was fit on all rows before the train/test split below, so
# test rows influence the learned components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the raw feature names makes
# the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not included in X
# below; only the PCA components in selected_cols become model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; PCA is variance-sensitive, so
# unscaled features with large ranges dominate the components — verify ordering.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only,
# so no test-set information leaks into the scaling step.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; every hyperparameter listed in the grid below
# overrides this instance's value during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 spreads the 81 x 10 candidate fits over all CPU cores; results are
# unchanged because every fit is still seeded by random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default CV score is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9955159123177418
# Refit a RandomForestRegressor on the full training set with the best
# hyperparameters from the grid search. Unpacking best_params_ avoids
# hand-copying each key (the grid contains exactly these four parameters).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003560841308279542 R2 Score: 0.9970021670932279 RMSE: 0.059673 Entropy Value: 0.0011802071599462333
# Rank the model inputs by their random-forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.517316 |
| 0 | cardiovasc_death_rate | 0.432531 |
| 6 | median_age | 0.019175 |
| 5 | aged_65_older | 0.014394 |
| 2 | female_smokers | 0.011943 |
| 3 | male_smokers | 0.003277 |
| 4 | life_expectancy | 0.001363 |
# Reload the dataframe containing the first countries of each country pairing
# (produced in the previous step) so this section starts from fresh data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Finland'
country2 = 'Iceland'
# Keep only the columns used by the Random Forest Model Analysis for the
# country health index, then restrict the rows to the two selected countries.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[feature_cols]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated[pair_mask]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.04 | 40585.721 | 18.136 | 5540745 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.20 | 46482.958 | 3.404 | 372903 | 0.11011 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month).
# The shift is applied within each location group so values never leak across
# countries; leading NaNs (rows without enough history) are filled with 0.
# A single loop replaces three near-identical shift/fillna statement pairs.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_location.shift(lag_days).fillna(0)
# Run Principal Component Analysis (PCA) on the numeric feature columns to
# address multi-collinearity among the predictors. fit() returns the fitted
# estimator itself, so fitting is chained directly onto construction.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# NOTE(review): df_updated.iloc[:,2:] also contains 'Mortality Rate' and the
# lagged mortality columns, so the target feeds into the PCA inputs (target
# leakage) — confirm this is intended.
# NOTE(review): PCA was fit on all rows before the train/test split below, so
# test rows influence the learned components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the raw feature names makes
# the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not included in X
# below; only the PCA components in selected_cols become model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; PCA is variance-sensitive, so
# unscaled features with large ranges dominate the components — verify ordering.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits with statistics learned from the training set only,
# so no test-set information leaks into the scaling step.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base RandomForestRegressor; every hyperparameter listed in the grid below
# overrides this instance's value during the search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 spreads the 81 x 10 candidate fits over all CPU cores; results are
# unchanged because every fit is still seeded by random_state=42.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
# (for a regressor the default CV score is R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9954917175836794
# Refit a RandomForestRegressor on the full training set with the best
# hyperparameters from the grid search. Unpacking best_params_ avoids
# hand-copying each key (the grid contains exactly these four parameters).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric; confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002995537974182363 R2 Score: 0.9974780897167169 RMSE: 0.054732 Entropy Value: 0.0010015390611765433
# Rank the model inputs by their random-forest importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.940726 |
| 0 | hospital_beds_per_thousand | 0.036143 |
| 2 | gdp_per_capita | 0.013851 |
| 3 | population_density | 0.006981 |
| 4 | population | 0.002299 |
# Reload the dataframe containing the first countries of each country pairing
# (produced in the previous step) so this section starts from fresh data.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the analysis.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Keep only the columns used by the Random Forest Model Analysis for the
# population health index, then restrict the rows to the two selected countries.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[feature_cols]
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated[pair_mask]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality-rate features (previous day / week / month).
# The shift is applied within each location group so values never leak across
# countries; leading NaNs (rows without enough history) are filled with 0.
# A single loop replaces three near-identical shift/fillna statement pairs.
mortality_by_location = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_location.shift(lag_days).fillna(0)
# Run Principal Component Analysis (PCA) on the numeric feature columns to
# address multi-collinearity among the predictors. fit() returns the fitted
# estimator itself, so fitting is chained directly onto construction.
pca = PCA().fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# NOTE(review): df_updated.iloc[:,2:] also contains 'Mortality Rate' and the
# lagged mortality columns, so the target feeds into the PCA inputs (target
# leakage) — confirm this is intended.
# NOTE(review): PCA was fit on all rows before the train/test split below, so
# test rows influence the learned components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original features — reusing the raw feature names makes
# the later importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not included in X
# below; only the PCA components in selected_cols become model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
# NOTE(review): scaling happens AFTER PCA here; PCA is variance-sensitive, so
# unscaled features with large ranges dominate the components — verify ordering.
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981832621613457
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002099363526029487 R2 Score: 0.9990806391844732 RMSE: 0.045819 Entropy Value: 0.0003826774935355706
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.508703 |
| 0 | cardiovasc_death_rate | 0.420926 |
| 2 | female_smokers | 0.028547 |
| 6 | median_age | 0.024134 |
| 1 | diabetes_prevalence | 0.015762 |
| 3 | male_smokers | 0.001625 |
| 4 | life_expectancy | 0.000304 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded, user-specific absolute path -- this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this run of the country-health-index model.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Restrict to the country-health-index feature set, then to the two countries.
pair_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, pair_cols]
# Display the filtered dataframe (notebook cell output).
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.2 | 94277.965 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.2 | 67335.293 | 69.874 | 5023108 | 0.491388 |
2076 rows × 8 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates using shift(): this converts the OWID COVID-19 time series into a
# supervised-learning table (one observation per row, one feature per column),
# the tabular format a Random Forest -- a non-sequential ensemble learner --
# needs in order to assess which variables best predict COVID-19 mortality.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Lag within each country so values never cross country boundaries;
    # leading NaNs (no history yet) are filled with 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))

# PCA to address multi-collinearity among the predictors.
# BUGFIX: the original fit PCA on df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' target itself, so the target leaked into the features and
# inflated every downstream score. The target is dropped before fitting.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'], errors='ignore')
pca = PCA()
pca.fit(pca_input)

# 5 principal components = number of country-health-index input variables.
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]

# NOTE(review): these columns are principal components, NOT the original
# variables -- the names are kept only for downstream compatibility; interpret
# feature importances as applying to PCs.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population'])
principal_df['location'] = df_updated['location'].values

# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
                 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features; scaler fit on the training set only (no test leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest regressor tuned by 10-fold cross-validated grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# n_jobs=-1 parallelizes the search without changing the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output of a previous run (will differ after the leakage fix above):
#   Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9981230977543898

# Refit a forest with the best hyperparameters on the full training set.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate: MSE, RMSE, R^2 and entropy. NOTE(review): scipy.stats.entropy
# normalizes both arrays into distributions and returns their KL divergence --
# a non-standard regression metric; confirm it is wanted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Previous run: MSE 0.003862  R2 0.998309  RMSE 0.062148  Entropy 0.000823

# Rank the principal components by importance in the fitted forest.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.920449 |
| 2 | gdp_per_capita | 0.043623 |
| 0 | hospital_beds_per_thousand | 0.027483 |
| 3 | population_density | 0.007933 |
| 4 | population | 0.000512 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded, user-specific absolute path -- this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this run of the population-health-index model.
country1 = 'Netherlands'
country2 = 'Slovenia'
# Restrict to the population-health-index feature set, then to the two countries.
pair_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, pair_cols]
# Display the filtered dataframe (notebook cell output).
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2099 rows × 10 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates using shift(): this converts the OWID COVID-19 time series into a
# supervised-learning table (one observation per row, one feature per column),
# the tabular format a Random Forest -- a non-sequential ensemble learner --
# needs in order to assess which variables best predict COVID-19 mortality.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Lag within each country so values never cross country boundaries;
    # leading NaNs (no history yet) are filled with 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))

# PCA to address multi-collinearity among the predictors.
# BUGFIX: the original fit PCA on df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' target itself, so the target leaked into the features and
# inflated every downstream score. The target is dropped before fitting.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'], errors='ignore')
pca = PCA()
pca.fit(pca_input)

# 7 principal components = number of population-health-index input variables.
# NOTE(review): keeping as many components as source variables gives little
# actual dimensionality reduction -- confirm this is intended.
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]

# NOTE(review): these columns are principal components, NOT the original
# variables -- the names are kept only for downstream compatibility; interpret
# feature importances as applying to PCs.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values

# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features; scaler fit on the training set only (no test leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest regressor tuned by 10-fold cross-validated grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# n_jobs=-1 parallelizes the search without changing the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output of a previous run (will differ after the leakage fix above):
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9986942730831515

# Refit a forest with the best hyperparameters on the full training set.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate: MSE, RMSE, R^2 and entropy. NOTE(review): scipy.stats.entropy
# normalizes both arrays into distributions and returns their KL divergence --
# a non-standard regression metric; confirm it is wanted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Previous run: MSE 0.003719  R2 0.999546  RMSE 0.060981  Entropy 0.000377

# Rank the principal components by importance in the fitted forest.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.860422 |
| 5 | aged_65_older | 0.118967 |
| 2 | female_smokers | 0.012216 |
| 6 | median_age | 0.002841 |
| 3 | male_smokers | 0.002676 |
| 0 | cardiovasc_death_rate | 0.002590 |
| 4 | life_expectancy | 0.000288 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded, user-specific absolute path -- this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this run of the country-health-index model.
country1 = 'Netherlands'
country2 = 'Slovenia'
# Restrict to the country-health-index feature set, then to the two countries.
pair_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, pair_cols]
# Display the filtered dataframe (notebook cell output).
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.1 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.0 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2099 rows × 8 columns
# Create lagged variables for the previous day's, week's, and month's mortality
# rates using shift(): this converts the OWID COVID-19 time series into a
# supervised-learning table (one observation per row, one feature per column),
# the tabular format a Random Forest -- a non-sequential ensemble learner --
# needs in order to assess which variables best predict COVID-19 mortality.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Lag within each country so values never cross country boundaries;
    # leading NaNs (no history yet) are filled with 0.
    df_updated[lag_col] = (df_updated.groupby(['location'])['Mortality Rate']
                           .shift(lag)
                           .fillna(0))

# PCA to address multi-collinearity among the predictors.
# BUGFIX: the original fit PCA on df_updated.iloc[:, 2:], which contains the
# 'Mortality Rate' target itself, so the target leaked into the features and
# inflated every downstream score. The target is dropped before fitting.
pca_input = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'], errors='ignore')
pca = PCA()
pca.fit(pca_input)

# 5 principal components = number of country-health-index input variables.
n_components = 5  # number of input variables for Random Forest Model Analysis
principal_components = pca.transform(pca_input)[:, :n_components]

# NOTE(review): these columns are principal components, NOT the original
# variables -- the names are kept only for downstream compatibility; interpret
# feature importances as applying to PCs.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population'])
principal_df['location'] = df_updated['location'].values

# One-hot encode the categorical 'location' column with get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
                 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split for the Random Forest model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features; scaler fit on the training set only (no test leakage).
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest regressor tuned by 10-fold cross-validated grid search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# n_jobs=-1 parallelizes the search without changing the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Output of a previous run (will differ after the leakage fix above):
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9987719337602877

# Refit a forest with the best hyperparameters on the full training set.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Evaluate: MSE, RMSE, R^2 and entropy. NOTE(review): scipy.stats.entropy
# normalizes both arrays into distributions and returns their KL divergence --
# a non-standard regression metric; confirm it is wanted.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Previous run: MSE 0.006276  R2 0.999234  RMSE 0.079222  Entropy 0.000519

# Rank the principal components by importance in the fitted forest.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.968319 |
| 2 | gdp_per_capita | 0.027643 |
| 0 | hospital_beds_per_thousand | 0.001991 |
| 3 | population_density | 0.001591 |
| 4 | population | 0.000457 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded, user-specific absolute path -- this only runs on the
# author's machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Bare expression: displays the dataframe as the notebook cell output.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the frame to the population-health-index features used by the
# Random Forest analysis, for the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 7 days, 30 days); shifting
# within each location group keeps one country's history from leaking into another's.
# The leading rows of each group, which have no lagged value, are zero-filled.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column from position 2 onward, and at this
# point df_updated also contains 'Mortality Rate' (the prediction target) plus the
# three lagged mortality columns — so the target leaks into the PCA inputs. Confirm
# this is intended; if not, drop those columns before fitting.
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns will
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components scores for every row (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original variables;
# reusing the raw feature names makes the later feature-importance table read as if
# it ranked the raw features — consider PC1..PC7 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never added to X below, so
# this encoding does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = PCA scores (positionally aligned with df_updated's rows); y = raw mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the statistics learned on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Base Random Forest regressor; the grid below tunes its main capacity parameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(n_estimators=[50, 100, 200],
                  max_depth=[5, 10, 15],
                  min_samples_split=[2, 5, 10],
                  min_samples_leaf=[1, 2, 4])
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9965358172344019
# Refit a Random Forest using the hyperparameters selected by the grid search above.
# best_params_ holds exactly n_estimators/max_depth/min_samples_split/min_samples_leaf,
# so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, and the entropy statistic.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence between the two
# vectors after normalising them, not the entropy of the predictions — confirm this is the
# intended diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.030651412600525802 R2 Score: 0.9941440982603341 RMSE: 0.175075 Entropy Value: 0.0009864234116361557
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the rows here are PCA component scores that were given raw feature
# names upstream, so this table ranks components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.841516 |
| 6 | median_age | 0.093364 |
| 5 | aged_65_older | 0.036116 |
| 0 | cardiovasc_death_rate | 0.013821 |
| 2 | female_smokers | 0.009758 |
| 3 | male_smokers | 0.004457 |
| 4 | life_expectancy | 0.000968 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Sweden'
country2 = 'Switzerland'
# Restrict the frame to the country-health-index features used by the
# Random Forest analysis, for the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.03 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.50 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 7 days, 30 days); shifting
# within each location group keeps one country's history from leaking into another's.
# The leading rows of each group, which have no lagged value, are zero-filled.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column from position 2 onward, and at this
# point df_updated also contains 'Mortality Rate' (the prediction target) plus the
# three lagged mortality columns — so the target leaks into the PCA inputs. Confirm
# this is intended; if not, drop those columns before fitting.
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns
# (e.g. population) will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Keep only the first n_components scores for every row (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original variables;
# reusing the raw feature names makes the later feature-importance table read as if
# it ranked the raw features — consider PC1..PC5 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never added to X below, so
# this encoding does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = PCA scores (positionally aligned with df_updated's rows); y = raw mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the statistics learned on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Base Random Forest regressor; the grid below tunes its main capacity parameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(n_estimators=[50, 100, 200],
                  max_depth=[5, 10, 15],
                  min_samples_split=[2, 5, 10],
                  min_samples_leaf=[1, 2, 4])
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.996752968199182
# Refit a Random Forest using the hyperparameters selected by the grid search above.
# best_params_ holds exactly n_estimators/max_depth/min_samples_split/min_samples_leaf,
# so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, and the entropy statistic.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence between the two
# vectors after normalising them, not the entropy of the predictions — confirm this is the
# intended diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.024455205155626165 R2 Score: 0.9953278734562381 RMSE: 0.156382 Entropy Value: 0.0011192911652519264
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the rows here are PCA component scores that were given raw feature
# names upstream, so this table ranks components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.961161 |
| 2 | gdp_per_capita | 0.020804 |
| 0 | hospital_beds_per_thousand | 0.011661 |
| 3 | population_density | 0.005177 |
| 4 | population | 0.001197 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'United States'
# Restrict the frame to the population-health-index features used by the
# Random Forest analysis, for the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1 day, 7 days, 30 days); shifting
# within each location group keeps one country's history from leaking into another's.
# The leading rows of each group, which have no lagged value, are zero-filled.
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column from position 2 onward, and at this
# point df_updated also contains 'Mortality Rate' (the prediction target) plus the
# three lagged mortality columns — so the target leaks into the PCA inputs. Confirm
# this is intended; if not, drop those columns before fitting.
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns will
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first n_components scores for every row (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores, not the original variables;
# reusing the raw feature names makes the later feature-importance table read as if
# it ranked the raw features — consider PC1..PC7 labels instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location dummy columns are never added to X below, so
# this encoding does not affect the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = PCA scores (positionally aligned with df_updated's rows); y = raw mortality target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training split only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits using the statistics learned on the training data.
X_train_scaled, X_test_scaled = (scaler.transform(X_train),
                                 scaler.transform(X_test))
# Base Random Forest regressor; the grid below tunes its main capacity parameters.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(n_estimators=[50, 100, 200],
                  max_depth=[5, 10, 15],
                  min_samples_split=[2, 5, 10],
                  min_samples_leaf=[1, 2, 4])
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9639563497859113
# Refit a Random Forest using the hyperparameters selected by the grid search above.
# best_params_ holds exactly n_estimators/max_depth/min_samples_split/min_samples_leaf,
# so unpacking it reproduces the tuned configuration.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, and the entropy statistic.
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes a KL divergence between the two
# vectors after normalising them, not the entropy of the predictions — confirm this is the
# intended diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.483882915680185 R2 Score: 0.980047429301368 RMSE: 0.695617 Entropy Value: 0.005987538685788008
# Rank the model inputs by impurity-based importance, highest first.
# NOTE(review): the rows here are PCA component scores that were given raw feature
# names upstream, so this table ranks components, not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.770054 |
| 1 | diabetes_prevalence | 0.089675 |
| 5 | aged_65_older | 0.045220 |
| 6 | median_age | 0.031215 |
| 2 | female_smokers | 0.026417 |
| 3 | male_smokers | 0.023605 |
| 4 | life_expectancy | 0.013812 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this breaks on any other machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'United States'
# Restrict the frame to the country-health-index features used by the
# Random Forest analysis, for the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 12547 | United Kingdom | 2/1/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 50.000000 |
| 12548 | United Kingdom | 2/2/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 100.000000 |
| 12549 | United Kingdom | 2/3/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12550 | United Kingdom | 2/4/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 25.000000 |
| 12551 | United Kingdom | 2/5/2020 | 2.54 | 0.2 | 39753.244 | 272.898 | 67508936 | 22.222222 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift(1)/shift(7)/shift(30) only mean "previous day/week/month"
# if each location's rows are consecutive calendar days sorted by date —
# TODO confirm the upstream ordering/completeness of the series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading-lag NaNs with 0 fabricates "zero mortality"
# history at each series start; dropping those rows would avoid the distortion.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns among the PCA inputs, so the components leak the target. PCA is also
# fitted on the full dataset before the train/test split and on unscaled data,
# so high-magnitude columns (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC5, each a linear
# combination of ALL the PCA inputs (including the target's lag columns) — the
# original feature names used here are misleading labels, and downstream
# "feature importances" therefore describe components, not these variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used afterwards; only
# 'Mortality Rate' is read from df_updated below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default, so past and future rows
# of the same time series are mixed across train/test — consider a
# time-ordered split (e.g. TimeSeriesSplit) for honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise the train and test matrices with the scaler fitted on the training split only.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base estimator handed to the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyper-parameter values for the Random Forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10) on the training split.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyper-parameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9563158884930116
# Reuse the model GridSearchCV already refit on the whole training split with
# the best hyper-parameters (refit=True is the default). The base estimator
# carries random_state=42, so this is the same model the manual
# RandomForestRegressor(**best_params_) refit produced — without training a
# duplicate forest.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two *probability distributions* (it normalises its inputs); mortality rates
# are not probabilities, so this value has no clear statistical meaning here.
# Kept only for output compatibility — consider MAE or MAPE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 2.3376129179018905 R2 Score: 0.9036101802748917 RMSE: 1.528925 Entropy Value: 0.011597152974395022
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.854034 |
| 2 | gdp_per_capita | 0.085807 |
| 3 | population_density | 0.033015 |
| 4 | population | 0.019051 |
| 0 | hospital_beds_per_thousand | 0.008094 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path makes the notebook
# non-portable; prefer a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Keep only the population-health features used by the Random Forest analysis,
# restricted to the two countries being compared.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the shift() lags assume consecutive, date-sorted rows per
# location — TODO confirm the upstream ordering of the series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the leading-lag NaNs fabricates "zero mortality"
# history at each series start; dropping those rows would avoid the bias.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) plus its lag
# columns among the PCA inputs (target leakage), and PCA is fitted on the full,
# unscaled dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC7, linear combinations of ALL the PCA
# inputs (including the target's lag columns) — the original feature names are
# misleading labels, so downstream "feature importances" rank components, not
# these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below; only 'Mortality Rate'
# is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split mixes past/future rows of the same time series;
# a time-ordered split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise the train and test matrices using the training-split scaler.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base estimator handed to the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyper-parameter values for the Random Forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyper-parameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
# Reuse the model GridSearchCV already refit on the whole training split with
# the best hyper-parameters (refit=True is the default); the base estimator's
# random_state=42 makes it identical to the manual refit it replaces, without
# training a duplicate forest.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between probability
# distributions; mortality rates are not probabilities, so this metric is not
# statistically meaningful here (kept for output compatibility).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00047399699097167984 R2 Score: 0.9991753265149302 RMSE: 0.021771 Entropy Value: 0.00015620474538757492
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.658891 |
| 0 | cardiovasc_death_rate | 0.219210 |
| 5 | aged_65_older | 0.052242 |
| 6 | median_age | 0.044419 |
| 2 | female_smokers | 0.022219 |
| 3 | male_smokers | 0.001522 |
| 4 | life_expectancy | 0.001498 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or a
# configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Keep only the socioeconomic / healthcare-capacity features used by the
# Random Forest country-health-index analysis, restricted to the two countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.15 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.00 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the shift() lags assume consecutive, date-sorted rows per
# location — TODO confirm the upstream ordering of the series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the leading-lag NaNs fabricates "zero mortality"
# history at each series start; dropping those rows would avoid the bias.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) plus its lag
# columns among the PCA inputs (target leakage), and PCA is fitted on the full,
# unscaled dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC5, linear combinations of ALL the PCA
# inputs (including the target's lag columns) — the original feature names are
# misleading labels, so downstream "feature importances" rank components, not
# these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below; only 'Mortality Rate'
# is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split mixes past/future rows of the same time series;
# a time-ordered split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise the train and test matrices using the training-split scaler.
X_train_scaled, X_test_scaled = (scaler.transform(m) for m in (X_train, X_test))
# Base estimator handed to the hyper-parameter search.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyper-parameter values for the Random Forest.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyper-parameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9866151912407417
# Reuse the model GridSearchCV already refit on the whole training split with
# the best hyper-parameters (refit=True is the default); the base estimator's
# random_state=42 makes it identical to the manual refit it replaces, without
# training a duplicate forest.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between probability
# distributions; mortality rates are not probabilities, so this metric is not
# statistically meaningful here (kept for output compatibility).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008157893614715191 R2 Score: 0.9858066639953047 RMSE: 0.090321 Entropy Value: 0.002246714925831334
# Rank the model inputs by impurity-based importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.818042 |
| 0 | hospital_beds_per_thousand | 0.113415 |
| 2 | gdp_per_capita | 0.051214 |
| 3 | population_density | 0.013891 |
| 4 | population | 0.003438 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or a
# configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'France'
# Keep only the population-health features used by the Random Forest analysis,
# restricted to the two countries being compared.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2132 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): the shift() lags assume consecutive, date-sorted rows per
# location; the Estonia rows shown above have date gaps (1/6 -> 1/18 -> 2/5),
# so "previous day/week/month" is not literally true for this pair — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the leading-lag NaNs fabricates "zero mortality"
# history at each series start; dropping those rows would avoid the bias.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) plus its lag
# columns among the PCA inputs (target leakage), and PCA is fitted on the full,
# unscaled dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC7, linear combinations of ALL the PCA
# inputs (including the target's lag columns) — the original feature names are
# misleading labels, so downstream "feature importances" rank components, not
# these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below; only 'Mortality Rate'
# is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): shuffled split mixes past/future rows of the same time series;
# a time-ordered split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.992829851903131
# fit random forest model with best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
max_depth=grid_search.best_params_['max_depth'],
min_samples_split=grid_search.best_params_['min_samples_split'],
min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.08964626978739043 R2 Score: 0.9906665614728145 RMSE: 0.299410 Entropy Value: 0.005413259877489937
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.795771 |
| 0 | cardiovasc_death_rate | 0.160583 |
| 2 | female_smokers | 0.024566 |
| 5 | aged_65_older | 0.009192 |
| 6 | median_age | 0.007044 |
| 3 | male_smokers | 0.002026 |
| 4 | life_expectancy | 0.000817 |
# --- Load data and select the Estonia/France country pair -------------------
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step (27272 rows x 17 columns).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# (notebook output removed: DataFrame preview, 27272 rows x 17 columns)
# Pair of countries compared in this run
country1 = 'Estonia'
country2 = 'France'
# Keep only the socio-economic features used by the Random Forest Model
# Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes the slice an independent frame
# so later lag-column assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
# (notebook output removed: DataFrame preview, 2132 rows x 8 columns)
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Country health index model (Estonia vs France) -------------------------
# Same pipeline shape as the population health index run: lag features ->
# PCA -> 70/30 split -> standardize -> grid-searched RandomForestRegressor.

# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, shifted within each country via groupby.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the leading NaN values produced by shift() with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and the lag
# columns, so the target leaks into the PCA inputs — likely inflating the
# near-perfect scores recorded below. Confirm the column order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# Setting the number of principal components to 5 as this equals the number
# of input variables for the Random Forest Model Analysis for the country
# health index.
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the retained components in a DataFrame.
# NOTE(review): these columns are principal components, not the original
# variables — reusing the original feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak in.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output:
#   Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 100}
#   Best CV score: 0.9922426580613892
# Refit a random forest with the best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                      max_depth=grid_search.best_params_['max_depth'],
                                      min_samples_split=grid_search.best_params_['min_samples_split'],
                                      min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): entropy(y_test, y_pred) is a KL divergence over normalised
# arrays — questionable as a regression metric here; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output:
#   MSE: 0.10037550062654846  R2 Score: 0.9895494975200281
#   RMSE: 0.316821            Entropy Value: 0.005971530997882069
# Rank the (principal-component) features by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded ranking: extreme_poverty (0.950) > gdp_per_capita (0.027)
# > hospital_beds_per_thousand > population_density > population
# --- Load data and select the Italy/Latvia pair (population health) ---------
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step (27272 rows x 17 columns).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# (notebook output removed: DataFrame preview, 27272 rows x 17 columns)
# Pair of countries compared in this run
country1 = 'Italy'
country2 = 'Latvia'
# Keep only the population-health features used by the Random Forest Model
# Analysis for the population health index.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes the slice an independent frame
# so later lag-column assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
# (notebook output removed: DataFrame preview, 2102 rows x 10 columns)
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Population health index model (Italy vs Latvia) ------------------------
# Pipeline: lag features -> PCA (7 components) -> 70/30 split -> standardize
# -> grid-searched RandomForestRegressor -> metrics + importances.

# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, shifted within each country via groupby.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the leading NaN values produced by shift() with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and the lag
# columns — the target leaks into the PCA inputs; confirm the column order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# Setting the number of principal components to 7 as this equals the number
# of input variables for the Random Forest Model Analysis for the population
# health index.
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the retained components in a DataFrame.
# NOTE(review): columns are principal components, not the original variables;
# reusing the original feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak in.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output:
#   Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9987367076025663
# Refit a random forest with the best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                      max_depth=grid_search.best_params_['max_depth'],
                                      min_samples_split=grid_search.best_params_['min_samples_split'],
                                      min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): entropy(y_test, y_pred) is a KL divergence over normalised
# arrays — questionable as a regression metric here; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output:
#   MSE: 0.01325682921637842  R2 Score: 0.9988426183663411
#   RMSE: 0.115138            Entropy Value: 0.0005827591181866116
# Rank the (principal-component) features by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded ranking: diabetes_prevalence (0.469) > cardiovasc_death_rate
# (0.462) > aged_65_older > female_smokers > median_age > male_smokers
# > life_expectancy
# --- Load data and select the Italy/Latvia pair (country health) ------------
# Importing the dataframe that includes the first countries in each pairing of
# countries from the previous step (27272 rows x 17 columns).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# (notebook output removed: DataFrame preview, 27272 rows x 17 columns)
# Pair of countries compared in this run
country1 = 'Italy'
country2 = 'Latvia'
# Keep only the socio-economic features used by the Random Forest Model
# Analysis for the country health index.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries; .copy() makes the slice an independent frame
# so later lag-column assignments do not raise SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
# (notebook output removed: DataFrame preview, 2102 rows x 8 columns)
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# --- Country health index model (Italy vs Latvia) ---------------------------
# Pipeline: lag features -> PCA (5 components) -> 70/30 split -> standardize
# -> grid-searched RandomForestRegressor -> metrics + importances.

# Create lagged variables for the previous day, previous week, and previous
# month mortality rates, shifted within each country via groupby.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace the leading NaN values produced by shift() with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) to address multi-collinearity.
# NOTE(review): iloc[:, 2:] appears to include 'Mortality Rate' and the lag
# columns — the target leaks into the PCA inputs; confirm the column order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
# Setting the number of principal components to 5 as this equals the number
# of input variables for the Random Forest Model Analysis for the country
# health index.
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Wrap the retained components in a DataFrame.
# NOTE(review): columns are principal components, not the original variables;
# reusing the original feature names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (dummy columns are not used as model inputs).
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training and testing sets (70/30, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set only, so no test-set statistics leak in.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Perform grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1,
#                          'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9987496708836977
# Refit a random forest with the best hyperparameters from above
best_rf_model = RandomForestRegressor(n_estimators=grid_search.best_params_['n_estimators'],
                                      max_depth=grid_search.best_params_['max_depth'],
                                      min_samples_split=grid_search.best_params_['min_samples_split'],
                                      min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                      random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model: MSE, RMSE, R^2 score, and entropy.
# NOTE(review): entropy(y_test, y_pred) is a KL divergence over normalised
# arrays — questionable as a regression metric here; confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded output:
#   MSE: 0.012769557121584701  R2 Score: 0.9988851594418807
#   RMSE: 0.113002             Entropy Value: 0.000472757332466212
# Rank the (principal-component) features by importance.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded ranking: extreme_poverty (0.948) > gdp_per_capita (0.026)
# > hospital_beds_per_thousand > population_density > population
# Load the per-pair export ("dataframe one") produced in the previous step.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovakia'
# Restrict to the population-health predictors plus identifiers and the
# target, for the current country pair only.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12542 | Slovakia | 12/25/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783216 |
| 12543 | Slovakia | 12/26/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783313 |
| 12544 | Slovakia | 12/27/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783363 |
| 12545 | Slovakia | 12/28/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783459 |
| 12546 | Slovakia | 12/29/2022 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.783522 |
2063 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's lags inside its own series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the pre-history lags assumes mortality was 0
# before each series starts — confirm this holds for every country pair.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# lagged-mortality columns, so the target leaks into the fitted components;
# PCA is also fitted on unscaled data over the FULL dataset (before the
# train/test split). Both likely inflate the downstream R^2 — confirm and
# consider refitting on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL inputs;
# naming the component columns after individual original features is cosmetic
# and potentially misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (The location dummies are not included in X below.)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the first 7 principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# Base random-forest regressor; the grid search below tunes its key knobs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid to search exhaustively.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# 10-fold cross-validated grid search (default R^2 scoring for regressors).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9972097011690918
# Refit the random forest on the full training set with the grid-search
# winners, then predict on the held-out test set.
# Unpacking best_params_ replaces four hand-copied keyword arguments that
# could drift out of sync with the grid; grid_search.best_estimator_ is an
# equivalent, already-refit alternative (GridSearchCV defaults to refit=True).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error of the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: error in the target's own units.
rmse = np.sqrt(mse)
# Coefficient of determination (1.0 = perfect fit).
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001539940090586167 R2 Score: 0.9978668126685196 RMSE: 0.039242 Entropy Value: 0.0003056208558480997
# Impurity-based importances of the model's input features.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components; labelling them with the original
# column names in selected_cols is misleading — verify before interpreting.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.573738 |
| 0 | cardiovasc_death_rate | 0.369553 |
| 6 | median_age | 0.030975 |
| 2 | female_smokers | 0.018394 |
| 5 | aged_65_older | 0.003914 |
| 3 | male_smokers | 0.002962 |
| 4 | life_expectancy | 0.000464 |
# Reload the per-pair export ("dataframe one") for the next analysis pass.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Slovakia'
# Restrict to the country-health predictors plus identifiers and the target,
# for the current country pair only.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.5 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12542 | Slovakia | 12/25/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783216 |
| 12543 | Slovakia | 12/26/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783313 |
| 12544 | Slovakia | 12/27/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783363 |
| 12545 | Slovakia | 12/28/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783459 |
| 12546 | Slovakia | 12/29/2022 | 5.82 | 0.7 | 30155.152 | 113.128 | 5643455 | 0.783522 |
2063 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's lags inside its own series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the pre-history lags assumes mortality was 0
# before each series starts — confirm this holds for every country pair.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# lagged-mortality columns, so the target leaks into the fitted components;
# PCA is also fitted on unscaled data over the FULL dataset (before the
# train/test split). Both likely inflate the downstream R^2 — confirm and
# consider refitting on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project onto all components, then keep the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL inputs;
# naming the component columns after individual original features is cosmetic
# and potentially misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (The location dummies are not included in X below.)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs: the first 5 principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# Base random-forest regressor; the grid search below tunes its key knobs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid to search exhaustively.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# 10-fold cross-validated grid search (default R^2 scoring for regressors).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9978257589564323
# Refit the random forest on the full training set with the grid-search
# winners, then predict on the held-out test set.
# Unpacking best_params_ replaces four hand-copied keyword arguments that
# could drift out of sync with the grid; grid_search.best_estimator_ is an
# equivalent, already-refit alternative (GridSearchCV defaults to refit=True).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error of the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: error in the target's own units.
rmse = np.sqrt(mse)
# Coefficient of determination (1.0 = perfect fit).
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0017579514577669621 R2 Score: 0.9975648145002586 RMSE: 0.041928 Entropy Value: 0.00038885161098168246
# Impurity-based importances of the model's input features.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components; labelling them with the original
# column names in selected_cols is misleading — verify before interpreting.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.940339 |
| 0 | hospital_beds_per_thousand | 0.028359 |
| 2 | gdp_per_capita | 0.025732 |
| 3 | population_density | 0.005010 |
| 4 | population | 0.000560 |
# Reload the per-pair export ("dataframe one") for the next country pair.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Bulgaria'
# Restrict to the population-health predictors plus identifiers and the
# target, for the current country pair only.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.855148 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's lags inside its own series.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the pre-history lags assumes mortality was 0
# before each series starts; Bulgaria's series here starts at 14.29, so the
# first lag rows drop artificially to 0 — confirm this is acceptable.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# lagged-mortality columns, so the target leaks into the fitted components;
# PCA is also fitted on unscaled data over the FULL dataset (before the
# train/test split). Both likely inflate the downstream R^2 — confirm and
# consider refitting on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project onto all components, then keep the first n_components columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each principal component is a linear mixture of ALL inputs;
# naming the component columns after individual original features is cosmetic
# and potentially misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (The location dummies are not included in X below.)
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs: the first 7 principal components; target: the mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
# Learn standardisation statistics from the training set only.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the train-fitted scaler.
X_train_scaled, X_test_scaled = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)
# Base random-forest regressor; the grid search below tunes its key knobs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid to search exhaustively.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# 10-fold cross-validated grid search (default R^2 scoring for regressors).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9819133940673177
# Refit the random forest on the full training set with the grid-search
# winners, then predict on the held-out test set.
# Unpacking best_params_ replaces four hand-copied keyword arguments that
# could drift out of sync with the grid; grid_search.best_estimator_ is an
# equivalent, already-refit alternative (GridSearchCV defaults to refit=True).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# Mean squared error of the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE: error in the target's own units.
rmse = np.sqrt(mse)
# Coefficient of determination (1.0 = perfect fit).
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises both arguments into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — confirm this is intentional.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005371342469960679 R2 Score: 0.9989002795207302 RMSE: 0.073289 Entropy Value: 0.0005215588278580451
# Impurity-based importances of the model's input features.
feature_importances = best_rf_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components; labelling them with the original
# column names in selected_cols is misleading — verify before interpreting.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.474280 |
| 1 | diabetes_prevalence | 0.397780 |
| 0 | cardiovasc_death_rate | 0.091164 |
| 2 | female_smokers | 0.019135 |
| 6 | median_age | 0.010034 |
| 3 | male_smokers | 0.005037 |
| 4 | life_expectancy | 0.002570 |
# Reload the per-pair export ("dataframe one") for the country-health pass.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Spain'
country2 = 'Bulgaria'
# Restrict to the country-health predictors plus identifiers and the target,
# for the current country pair only.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 1.5 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 25132 | Spain | 12/25/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25133 | Spain | 12/26/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25134 | Spain | 12/27/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25135 | Spain | 12/28/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
| 25136 | Spain | 12/29/2022 | 2.970 | 1.0 | 34272.360 | 93.105 | 47558632 | 0.855148 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 7 days, and 30 days
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, periods in lag_periods.items():
    # shift() leaves NaN at the head of each country's series; fill those with 0
    df_updated[lag_column] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here includes
# 'Mortality Rate' itself plus its three lag columns — so the target leaks into the principal
# components later used as model inputs; confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns (e.g. population)
# will dominate the components; consider standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per predictor used in the
# country-health-index Random Forest model.
n_components = 5 # of input variables for Random Forest Model Analysis
components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the original feature names as labels can mislead any downstream
# feature-importance reading.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
                 'population_density', 'population']
pc_df = pd.DataFrame(data=components, columns=selected_cols)
pc_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = pc_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only
scaler = StandardScaler().fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the statistics learned from the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seeded so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9840422767394642
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (it normalizes them), so this is the KL divergence of the normalized series,
# not a conventional regression metric — confirm this is what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0071890946885995385 R2 Score: 0.9985281156990684 RMSE: 0.084789 Entropy Value: 0.0006206494126241532
# Rank the model inputs by their Random Forest importance scores, highest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.934779 |
| 2 | gdp_per_capita | 0.051476 |
| 4 | population | 0.007491 |
| 3 | population_density | 0.004398 |
| 0 | hospital_beds_per_thousand | 0.001856 |
# Load the dataframe of first-pair countries produced in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the population-health-index analysis
country1 = 'Romania'
country2 = 'Serbia'
# Keep the identifier columns plus the population-health predictors and the target
health_feature_columns = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                          'median_age', 'Mortality Rate']
df_updated = df_updated.loc[:, health_feature_columns]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 7 days, and 30 days
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, periods in lag_periods.items():
    # shift() leaves NaN at the head of each country's series; fill those with 0
    df_updated[lag_column] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here includes
# 'Mortality Rate' itself plus its three lag columns — so the target leaks into the principal
# components later used as model inputs; confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns will dominate
# the components; consider standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per predictor used in the
# population-health-index Random Forest model.
n_components = 7 # of input variables for the Random Forest Model Analysis
components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the original feature names as labels can mislead any downstream
# feature-importance reading.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pc_df = pd.DataFrame(data=components, columns=selected_cols)
pc_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = pc_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only
scaler = StandardScaler().fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the statistics learned from the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seeded so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9971039894626419
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (it normalizes them), so this is the KL divergence of the normalized series,
# not a conventional regression metric — confirm this is what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002022927413440657 R2 Score: 0.9988222346389628 RMSE: 0.044977 Entropy Value: 0.00025466415334854067
# Rank the model inputs by their Random Forest importance scores, highest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.701857 |
| 5 | aged_65_older | 0.104510 |
| 1 | diabetes_prevalence | 0.099736 |
| 6 | median_age | 0.082303 |
| 2 | female_smokers | 0.009089 |
| 3 | male_smokers | 0.002010 |
| 4 | life_expectancy | 0.000495 |
# Load the dataframe of first-pair countries produced in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair under comparison for the country-health-index analysis
country1 = 'Romania'
country2 = 'Serbia'
# Keep the identifier columns plus the country-health predictors and the target
index_feature_columns = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
                         'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[:, index_feature_columns]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
df_updated
| location | date | hospital_beds_per_thousand | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 15721 | Serbia | 2/26/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15722 | Serbia | 2/27/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15723 | Serbia | 2/28/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15724 | Serbia | 2/29/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| 15725 | Serbia | 3/1/2020 | 5.609 | 0.05 | 14048.881 | 80.291 | 6871547 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 5.70 | 23313.199 | 85.129 | 19659270 | 2.036403 |
2076 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 7 days, and 30 days
lag_periods = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_column, periods in lag_periods.items():
    # shift() leaves NaN at the head of each country's series; fill those with 0
    df_updated[lag_column] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here includes
# 'Mortality Rate' itself plus its three lag columns — so the target leaks into the principal
# components later used as model inputs; confirm whether that is intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns (e.g. population)
# will dominate the components; consider standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per predictor used in the
# country-health-index Random Forest model.
n_components = 5 # of input variables for Random Forest Model Analysis
components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original variables;
# reusing the original feature names as labels can mislead any downstream
# feature-importance reading.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita',
                 'population_density', 'population']
pc_df = pd.DataFrame(data=components, columns=selected_cols)
pc_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies()
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = pc_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only
scaler = StandardScaler().fit(X_train)
StandardScaler() — In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Transform both splits with the statistics learned from the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seeded so results are reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.996697918034125
# Refit a Random Forest using the best hyperparameters found by the grid search
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its arguments as probability distributions
# (it normalizes them), so this is the KL divergence of the normalized series,
# not a conventional regression metric — confirm this is what is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001983248045491881 R2 Score: 0.9988453363008452 RMSE: 0.044534 Entropy Value: 0.00043708292617609305
# Rank the model inputs by their Random Forest importance scores, highest first
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
}).sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | extreme_poverty | 0.659949 |
| 0 | hospital_beds_per_thousand | 0.309256 |
| 2 | gdp_per_capita | 0.022356 |
| 3 | population_density | 0.007392 |
| 4 | population | 0.001047 |
# Country Pair by Pair Analysis relative to extreme poverty
# Load the cleaned and preprocessed Our World in Data COVID-19 dataset
cleaned_data_path = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(cleaned_data_path)
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Pairings of countries based on extreme poverty (13 pairs): one dataframe per country
df_Cyprus = df.loc[df['location'] == "Cyprus"]
df_Czechia = df.loc[df['location'] == "Czechia"]
df_Finland = df.loc[df['location'] == "Finland"]
df_France = df.loc[df['location'] == "France"]
df_Netherlands = df.loc[df['location'] == "Netherlands"]
df_Serbia = df.loc[df['location'] == "Serbia"]
df_Slovenia = df.loc[df['location'] == "Slovenia"]
df_Switzerland = df.loc[df['location'] == "Switzerland"]
df_Austria = df.loc[df['location'] == "Austria"]
df_Belgium = df.loc[df['location'] == "Belgium"]
df_Canada = df.loc[df['location'] == "Canada"]
df_Denmark = df.loc[df['location'] == "Denmark"]
df_Estonia = df.loc[df['location'] == "Estonia"]
df_Iceland = df.loc[df['location'] == "Iceland"]
df_Ireland = df.loc[df['location'] == "Ireland"]
df_Latvia = df.loc[df['location'] == "Latvia"]
df_Luxembourg = df.loc[df['location'] == "Luxembourg"]
df_Portugal = df.loc[df['location'] == "Portugal"]
df_Slovakia = df.loc[df['location'] == "Slovakia"]
df_Sweden = df.loc[df['location'] == "Sweden"]
df_UnitedKingdom = df.loc[df['location'] == "United Kingdom"]
df_Bulgaria = df.loc[df['location'] == "Bulgaria"]
df_Italy = df.loc[df['location'] == "Italy"]
df_Romania = df.loc[df['location'] == "Romania"]
df_Spain = df.loc[df['location'] == "Spain"]
df_UnitedStates = df.loc[df['location'] == "United States"]
# tail(-2) keeps all rows except the first two of the UK series
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Stack the first country of each defined pair into a single combined dataframe
dataframe_one = pd.concat([df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia,
                           df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands,
                           df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland,
                           df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland,
                           df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain,
                           df_Slovenia, df_UnitedStates])
# Persist the combined dataframe for the downstream per-pair analyses
dataframe_one.to_csv("dataframe-one.csv")
# Load the dataframe of first-pair countries produced in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Restrict to the population-health predictors (plus identifiers and the
# target), keeping only the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
# Converting the time-series data into a supervised-learning layout: a Random
# Forest is a non-sequential ensemble model, so each row must carry its own
# temporal context as columns. pandas' shift() builds lagged copies of the
# target (previous day / week / month mortality) to encode that context, which
# is what allows the Random Forest to be applied directly to assess which
# variables are the strongest predictors of COVID-19 mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so values never leak across locations, then
    # fill the leading NaN gap (no history available yet) with 0.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# among the predictors before fitting the Random Forest.
# FIX(review): fit/transform PCA on the predictor columns ONLY. The original
# code used df_updated.iloc[:, 2:], which also contained 'Mortality Rate' and
# its three lagged copies -- leaking the target into the model inputs and
# inflating the downstream CV/R^2 scores.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep all 7 components (one per input variable for the population health index).
n_components = 7
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: the columns below are principal components, not the original variables;
# the original names are reused only to keep downstream labels consistent.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies(); the
# dummy columns stay on df_updated, though only the target column is read below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs are the principal components; the target is the mortality rate.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (avoids test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test sets with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
# Refit a fresh forest using the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: MSE, RMSE, R^2 score, and a divergence-based value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two vectors re-normalized as probability distributions -- not a standard
# regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00047399699097167984 R2 Score: 0.9991753265149302 RMSE: 0.021771 Entropy Value: 0.00015620474538757492
# Rank the principal-component inputs by their importance to the fitted forest.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.658891 |
| 0 | cardiovasc_death_rate | 0.219210 |
| 5 | aged_65_older | 0.052242 |
| 6 | median_age | 0.044419 |
| 2 | female_smokers | 0.022219 |
| 3 | male_smokers | 0.001522 |
| 4 | life_expectancy | 0.001498 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reloading from disk resets df_updated -- which the previous section narrowed,
# lagged, and one-hot encoded -- back to the full saved snapshot.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Restrict to the country-health-index predictors (plus identifiers and the
# target), keeping only the two countries under comparison.
infra_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), infra_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 3.40 | 0.887 | 32415.132 | 127.657 | 896007 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 6.63 | 0.900 | 32605.906 | 137.176 | 10493990 | 0.919575 |
2061 rows × 8 columns
# Converting the time-series data into a supervised-learning layout: a Random
# Forest is a non-sequential ensemble model, so each row must carry its own
# temporal context as columns. pandas' shift() builds lagged copies of the
# target (previous day / week / month mortality) to encode that context, which
# is what allows the Random Forest to be applied directly to assess which
# variables are the strongest predictors of COVID-19 mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so values never leak across locations, then
    # fill the leading NaN gap (no history available yet) with 0.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# among the predictors before fitting the Random Forest.
# FIX(review): fit/transform PCA on the predictor columns ONLY. The original
# code used df_updated.iloc[:, 2:], which also contained 'Mortality Rate' and
# its three lagged copies -- leaking the target into the model inputs and
# inflating the downstream CV/R^2 scores.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep all 5 components (one per input variable for the country health index).
n_components = 5
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: the columns below are principal components, not the original variables;
# the original names are reused only to keep downstream labels consistent.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies(); the
# dummy columns stay on df_updated, though only the target column is read below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'gdp_per_capita', 'population_density', 'population']
# Model inputs are the principal components; the target is the mortality rate.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (avoids test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test sets with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9866151912407417
# Refit a fresh forest using the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: MSE, RMSE, R^2 score, and a divergence-based value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two vectors re-normalized as probability distributions -- not a standard
# regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.008157893614715191 R2 Score: 0.9858066639953047 RMSE: 0.090321 Entropy Value: 0.002246714925831334
# Rank the principal-component inputs by their importance to the fitted forest.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.818042 |
| 0 | hospital_beds_per_thousand | 0.113415 |
| 2 | gdp_per_capita | 0.051214 |
| 3 | population_density | 0.013891 |
| 4 | population | 0.003438 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reloading from disk resets df_updated -- which the previous section narrowed,
# lagged, and one-hot encoded -- back to the full saved snapshot.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'France'
# Restrict to the population-health predictors (plus identifiers and the
# target), keeping only the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2137 rows × 10 columns
# Converting the time-series data into a supervised-learning layout: a Random
# Forest is a non-sequential ensemble model, so each row must carry its own
# temporal context as columns. pandas' shift() builds lagged copies of the
# target (previous day / week / month mortality) to encode that context, which
# is what allows the Random Forest to be applied directly to assess which
# variables are the strongest predictors of COVID-19 mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so values never leak across locations, then
    # fill the leading NaN gap (no history available yet) with 0.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) to address multi-collinearity
# among the predictors before fitting the Random Forest.
# FIX(review): fit/transform PCA on the predictor columns ONLY. The original
# code used df_updated.iloc[:, 2:], which also contained 'Mortality Rate' and
# its three lagged copies -- leaking the target into the model inputs and
# inflating the downstream CV/R^2 scores.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep all 7 components (one per input variable for the population health index).
n_components = 7
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE: the columns below are principal components, not the original variables;
# the original names are reused only to keep downstream labels consistent.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column with get_dummies(); the
# dummy columns stay on df_updated, though only the target column is read below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs are the principal components; the target is the mortality rate.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training portion only (avoids test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test sets with the scaler fitted on the training data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator; its hyperparameters are tuned by the grid search below.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the exhaustive search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9901279701885508
# Refit a fresh forest using the best hyperparameters found by the grid search.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Hold-out evaluation: MSE, RMSE, R^2 score, and a divergence-based value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the
# two vectors re-normalized as probability distributions -- not a standard
# regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03485209804982485 R2 Score: 0.9965523687852408 RMSE: 0.186687 Entropy Value: 0.0021807550391466923
# Rank the principal-component inputs by their importance to the fitted forest.
feature_importances = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.519750 |
| 0 | cardiovasc_death_rate | 0.429350 |
| 2 | female_smokers | 0.034202 |
| 5 | aged_65_older | 0.006474 |
| 3 | male_smokers | 0.004594 |
| 6 | median_age | 0.004512 |
| 4 | life_expectancy | 0.001119 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# Reloading from disk resets df_updated -- which the previous section narrowed,
# lagged, and one-hot encoded -- back to the full saved snapshot.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the reloaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Finland'
country2 = 'France'
# Restrict to the country-health-index predictors (plus identifiers and the
# target), keeping only the two countries under comparison.
infra_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), infra_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 40585.721 | 18.136 | 5540745 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 38605.671 | 122.578 | 67813000 | 0.411892 |
2137 rows × 8 columns
# Converting the time-series data into a supervised-learning layout: a Random
# Forest is a non-sequential ensemble model, so each row must carry its own
# temporal context as columns. pandas' shift() builds lagged copies of the
# target (previous day / week / month mortality) to encode that context, which
# is what allows the Random Forest to be applied directly to assess which
# variables are the strongest predictors of COVID-19 mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    # Shift within each country so values never leak across locations, then
    # fill the leading NaN gap (no history available yet) with 0.
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a linear mix of ALL input
# columns), not the original variable it is named after — these labels are misleading and
# make the later "feature importances" read as if they referred to the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from principal_df and y
# from 'Mortality Rate' — so this one-hot encoding step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split shuffles a time series; near-identical adjacent days
# land on both sides of the split, which inflates the test scores — confirm a time-based
# split was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training split to both splits, so the test
# set is standardised with training statistics only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor base model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidates
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9910318902554458
# Refit a random forest with the best hyperparameters found by the grid search;
# dict-unpacking best_params_ keeps this line in sync with the grid definition
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of two normalised
# distributions; regression targets are not probability vectors, so this value has
# no standard interpretation here — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.035576227278599215 R2 Score: 0.9964807366404823 RMSE: 0.188617 Entropy Value: 0.0016675150996863651
# Tabulate the tuned forest's impurity-based importances, largest first.
# (These importances describe the model's inputs, i.e. the PCA-derived columns.)
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937639 |
| 2 | gdp_per_capita | 0.039984 |
| 0 | hospital_beds_per_thousand | 0.012847 |
| 3 | population_density | 0.007729 |
| 4 | population | 0.001800 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this population-health-index analysis
country1 = 'Netherlands'
country2 = 'Serbia'
# Keep the identifier columns, the population-health features and the target,
# and restrict the rows to the two selected countries
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2075 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per country, so one country's series never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows of each country receive an artificial 0 instead of a
# real lagged value — consider dropping those rows instead; confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input includes
# 'Mortality Rate' (the prediction target) and the lag columns just created; PCA is also
# fitted on the full dataset before the train/test split and without standardisation —
# all three points look like leakage/scaling issues to confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a linear mix of ALL input
# columns), not the original variable it is named after — these labels are misleading and
# make the later "feature importances" read as if they referred to the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from principal_df and y
# from 'Mortality Rate' — so this one-hot encoding step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split shuffles a time series; near-identical adjacent days
# land on both sides of the split, which inflates the test scores — confirm a time-based
# split was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training split to both splits, so the test
# set is standardised with training statistics only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor base model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidates
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9988517103371555
# Refit a random forest with the best hyperparameters found by the grid search;
# dict-unpacking best_params_ keeps this line in sync with the grid definition
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of two normalised
# distributions; regression targets are not probability vectors, so this value has
# no standard interpretation here — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005307061778710648 R2 Score: 0.9992985438334173 RMSE: 0.072850 Entropy Value: 0.00033274960964428744
# Tabulate the tuned forest's impurity-based importances, largest first.
# (These importances describe the model's inputs, i.e. the PCA-derived columns.)
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.956279 |
| 1 | diabetes_prevalence | 0.033001 |
| 5 | aged_65_older | 0.007184 |
| 0 | cardiovasc_death_rate | 0.001792 |
| 2 | female_smokers | 0.000972 |
| 3 | male_smokers | 0.000687 |
| 4 | life_expectancy | 0.000085 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this country-health-index analysis
country1 = 'Netherlands'
country2 = 'Serbia'
# Keep the identifier columns, the country-health-index features and the target,
# and restrict the rows to the two selected countries
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
              'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.320 | 0.944 | 48472.545 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 14048.881 | 80.291 | 6871547 | 0.716205 |
2075 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per country, so one country's series never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows of each country receive an artificial 0 instead of a
# real lagged value — consider dropping those rows instead; confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input includes
# 'Mortality Rate' (the prediction target) and the lag columns just created; PCA is also
# fitted on the full dataset before the train/test split and without standardisation —
# all three points look like leakage/scaling issues to confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a linear mix of ALL input
# columns), not the original variable it is named after — these labels are misleading and
# make the later "feature importances" read as if they referred to the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from principal_df and y
# from 'Mortality Rate' — so this one-hot encoding step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split shuffles a time series; near-identical adjacent days
# land on both sides of the split, which inflates the test scores — confirm a time-based
# split was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training split to both splits, so the test
# set is standardised with training statistics only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor base model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidates
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.998367181995075
# Refit a random forest with the best hyperparameters found by the grid search;
# dict-unpacking best_params_ keeps this line in sync with the grid definition
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the model on the held-out test set: MSE, RMSE, R^2 and "entropy"
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of two normalised
# distributions; regression targets are not probability vectors, so this value has
# no standard interpretation here — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.007612509437698923 R2 Score: 0.9989938233412576 RMSE: 0.087250 Entropy Value: 0.0006561539548214934
# Tabulate the tuned forest's impurity-based importances, largest first.
# (These importances describe the model's inputs, i.e. the PCA-derived columns.)
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_rf_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.955071 |
| 2 | gdp_per_capita | 0.039929 |
| 3 | population_density | 0.002952 |
| 0 | hospital_beds_per_thousand | 0.001518 |
| 4 | population | 0.000531 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — only runs on this machine; consider a
# relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this population-health-index analysis
country1 = 'Slovenia'
country2 = 'Switzerland'
# Keep the identifier columns, the population-health features and the target,
# and restrict the rows to the two selected countries
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2101 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift() is applied per country, so one country's series never bleeds into another's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): the first 1/7/30 rows of each country receive an artificial 0 instead of a
# real lagged value — consider dropping those rows instead; confirm this is intended.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input includes
# 'Mortality Rate' (the prediction target) and the lag columns just created; PCA is also
# fitted on the full dataset before the train/test split and without standardisation —
# all three points look like leakage/scaling issues to confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a linear mix of ALL input
# columns), not the original variable it is named after — these labels are misleading and
# make the later "feature importances" read as if they referred to the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from principal_df and y
# from 'Mortality Rate' — so this one-hot encoding step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random 70/30 split shuffles a time series; near-identical adjacent days
# land on both sides of the split, which inflates the test scores — confirm a time-based
# split was not intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the scaler fitted on the training split to both splits, so the test
# set is standardised with training statistics only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor base model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidates
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; scores are unchanged.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998589125309279
# Fit a random forest with the best hyperparameters found by the grid search.
# Passing best_params_ directly avoids re-extracting each key by hand (this is
# equivalent to grid_search.best_estimator_, which GridSearchCV already refit
# on the full training set because refit=True by default).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalizes each argument to sum to 1);
# feeding it raw mortality values is of questionable statistical meaning and
# is kept only for continuity with earlier runs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0035706853382243314 R2 Score: 0.9987381127552114 RMSE: 0.059755 Entropy Value: 0.0005437521483565165
# Rank the model inputs by random-forest importance.  The inputs are PCA
# components labelled with original feature names, so these importances
# describe components, not the raw variables.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.962368 |
| 2 | female_smokers | 0.022123 |
| 0 | cardiovasc_death_rate | 0.012033 |
| 3 | male_smokers | 0.001968 |
| 6 | median_age | 0.000829 |
| 5 | aged_65_older | 0.000417 |
| 4 | life_expectancy | 0.000261 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded, user-specific absolute path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Slovenia'
country2 = 'Switzerland'
# Keep only the socio-economic columns used in the country-level analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 57410.166 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 31400.840 | 102.619 | 2119843 | 0.536669 |
2101 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the mortality rate (1 day / 7 days / 30 days back),
# computed per country, so the time series can be modelled as a supervised
# learning problem.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-level socio-economic predictors.
#
# FIX(review): the previous version fit the PCA on df_updated.iloc[:, 2:],
# which at this point also contains 'Mortality Rate' and its three lagged
# copies -- the prediction target leaked into the model inputs, which inflates
# the near-perfect scores reported downstream.  The PCA is now fit on the five
# intended predictors only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population']
# Keep one component per input variable (a pure rotation of the feature space).
n_components = 5
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): the columns below are principal components -- each is a linear
# combination of all five predictors.  They are labelled with the original
# feature names only so downstream code keeps working; interpret the
# feature-importance table with that in mind.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.  Note: the resulting dummy columns are not
# referenced again in this pipeline -- only 'Mortality Rate' is read from
# df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The five component columns taken from the PCA output frame as model inputs.
selected_cols = [
    'hospital_beds_per_thousand', 'human_development_index',
    'gdp_per_capita', 'population_density', 'population',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler().fit(X_train)
[cell output] StandardScaler()
(In a Jupyter environment, rerun this cell to show the HTML representation or trust the notebook.)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate configurations are evaluated below.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were shuffled by train_test_split, so CV folds mix
# temporally adjacent daily observations of the same country -- scores are
# likely optimistic for time-series data; TimeSeriesSplit would be stricter.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984365005501525
# Fit a random forest with the best hyperparameters found by the grid search.
# Passing best_params_ directly avoids re-extracting each key by hand (this is
# equivalent to grid_search.best_estimator_, which GridSearchCV already refit
# on the full training set because refit=True by default).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalizes each argument to sum to 1);
# feeding it raw mortality values is of questionable statistical meaning and
# is kept only for continuity with earlier runs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0053398329862075255 R2 Score: 0.9981128924852428 RMSE: 0.073074 Entropy Value: 0.0005566431428928525
# Rank the model inputs by random-forest importance.  The inputs are PCA
# components labelled with original feature names, so these importances
# describe components, not the raw variables.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.973207 |
| 2 | gdp_per_capita | 0.022780 |
| 3 | population_density | 0.002330 |
| 0 | hospital_beds_per_thousand | 0.001255 |
| 4 | population | 0.000427 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded, user-specific absolute path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the population-health columns used in this analysis, restricted to
# the two countries being compared.
keep_cols = ['location', 'date', 'cardiovasc_death_rate',
             'diabetes_prevalence', 'female_smokers', 'male_smokers',
             'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the mortality rate (1 day / 7 days / 30 days back),
# computed per country, so the time series can be modelled as a supervised
# learning problem.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# population-health predictors.
#
# FIX(review): the previous version fit the PCA on df_updated.iloc[:, 2:],
# which at this point also contains 'Mortality Rate' and its three lagged
# copies -- the prediction target leaked into the model inputs, which inflates
# the near-perfect scores reported downstream.  The PCA is now fit on the
# seven intended predictors only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy',
                'aged_65_older', 'median_age']
# Keep one component per input variable (a pure rotation of the feature space).
n_components = 7
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): the columns below are principal components -- each is a linear
# combination of all seven predictors.  They are labelled with the original
# feature names only so downstream code keeps working; interpret the
# feature-importance table with that in mind.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.  Note: the resulting dummy columns are not
# referenced again in this pipeline -- only 'Mortality Rate' is read from
# df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The seven component columns taken from the PCA output frame as model inputs.
selected_cols = [
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler().fit(X_train)
[cell output] StandardScaler()
(In a Jupyter environment, rerun this cell to show the HTML representation or trust the notebook.)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate configurations are evaluated below.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were shuffled by train_test_split, so CV folds mix
# temporally adjacent daily observations of the same country -- scores are
# likely optimistic for time-series data; TimeSeriesSplit would be stricter.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
# Fit a random forest with the best hyperparameters found by the grid search.
# Passing best_params_ directly avoids re-extracting each key by hand (this is
# equivalent to grid_search.best_estimator_, which GridSearchCV already refit
# on the full training set because refit=True by default).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalizes each argument to sum to 1);
# feeding it raw mortality values is of questionable statistical meaning and
# is kept only for continuity with earlier runs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.020452761128882797 R2 Score: 0.9982725374621599 RMSE: 0.143013 Entropy Value: 0.0008156180119545589
# Rank the model inputs by random-forest importance.  The inputs are PCA
# components labelled with original feature names, so these importances
# describe components, not the raw variables.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.853422 |
| 1 | diabetes_prevalence | 0.089967 |
| 0 | cardiovasc_death_rate | 0.036597 |
| 5 | aged_65_older | 0.011508 |
| 3 | male_smokers | 0.004570 |
| 2 | female_smokers | 0.003805 |
| 4 | life_expectancy | 0.000130 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded, user-specific absolute path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Belgium'
# Keep only the socio-economic columns used in the country-level analysis,
# restricted to the two countries being compared.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 45436.686 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 42658.576 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the mortality rate (1 day / 7 days / 30 days back),
# computed per country, so the time series can be modelled as a supervised
# learning problem.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    shifted = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
    # The first `lag_days` rows of each country have no history; treat them as 0.
    df_updated[lag_col] = shifted.fillna(0)
# Principal Component Analysis (PCA) to address multi-collinearity among the
# country-level socio-economic predictors.
#
# FIX(review): the previous version fit the PCA on df_updated.iloc[:, 2:],
# which at this point also contains 'Mortality Rate' and its three lagged
# copies -- the prediction target leaked into the model inputs, which inflates
# the near-perfect scores reported downstream.  The PCA is now fit on the five
# intended predictors only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population']
# Keep one component per input variable (a pure rotation of the feature space).
n_components = 5
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_updated[feature_cols])
# NOTE(review): the columns below are principal components -- each is a linear
# combination of all five predictors.  They are labelled with the original
# feature names only so downstream code keeps working; interpret the
# feature-importance table with that in mind.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the country label.  Note: the resulting dummy columns are not
# referenced again in this pipeline -- only 'Mortality Rate' is read from
# df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# The five component columns taken from the PCA output frame as model inputs.
selected_cols = [
    'hospital_beds_per_thousand', 'human_development_index',
    'gdp_per_capita', 'population_density', 'population',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn standardization statistics from the training split only.
scaler = StandardScaler().fit(X_train)
[cell output] StandardScaler()
(In a Jupyter environment, rerun this cell to show the HTML representation or trust the notebook.)
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
# 3*3*3*3 = 81 candidate configurations are evaluated below.
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were shuffled by train_test_split, so CV folds mix
# temporally adjacent daily observations of the same country -- scores are
# likely optimistic for time-series data; TimeSeriesSplit would be stricter.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
# Fit a random forest with the best hyperparameters found by the grid search.
# Passing best_params_ directly avoids re-extracting each key by hand (this is
# equivalent to grid_search.best_estimator_, which GridSearchCV already refit
# on the full training set because refit=True by default).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (it normalizes each argument to sum to 1);
# feeding it raw mortality values is of questionable statistical meaning and
# is kept only for continuity with earlier runs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010069518672884694 R2 Score: 0.9991495174577224 RMSE: 0.100347 Entropy Value: 0.00034019584039697294
# Rank the model inputs by random-forest importance.  The inputs are PCA
# components labelled with original feature names, so these importances
# describe components, not the raw variables.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.935823 |
| 2 | gdp_per_capita | 0.050241 |
| 0 | hospital_beds_per_thousand | 0.009467 |
| 3 | population_density | 0.003676 |
| 4 | population | 0.000793 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded, user-specific absolute path -- consider a relative
# path or configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Keep the location/date keys, the population-health predictors, and the target,
# restricted to the two countries being compared.
health_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date'] + health_cols + ['Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day / week / month);
# leading positions with no history are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' (and the
# lagged mortality features) in the PCA input, so the target leaks into the
# components later used as predictors -- confirm whether this is intended.
# NOTE(review): PCA is also fit on unscaled data covering the full dataset
# (before the train/test split), which leaks test-set information into the fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Fix: fit/transform PCA on the predictors only. The original projected
# df_updated.iloc[:, 2:], which includes the target column 'Mortality Rate',
# leaking the target into the features (and inflating the reported R^2).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(predictors)
# Resulting dataframe after PCA. NOTE(review): these columns are principal
# components, not the original features; the original feature names are kept
# only so the downstream feature-importance labels stay consistent.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['cardiovasc_death_rate', 'diabetes_prevalence',
                                     'female_smokers', 'male_smokers',
                                     'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training partition only, so the test set does not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 configurations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10);
# n_jobs=-1 parallelises the 81 * 10 fits across all cores (results unchanged)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
# GridSearchCV refits the best configuration on the full training set by default
# (refit=True), and the base estimator already carries random_state=42, so reuse
# that estimator instead of re-instantiating and re-fitting an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments computes the KL
# divergence between the (normalised) y_test and y_pred distributions, not
# Shannon entropy, and requires non-negative inputs. Kept for continuity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002748350813295417 R2 Score: 0.9993440760119627 RMSE: 0.052425 Entropy Value: 0.00028602597187348704
# Rank the model inputs by their importance in the fitted forest,
# highest first, and display the resulting table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.742827 |
| 0 | cardiovasc_death_rate | 0.187586 |
| 6 | median_age | 0.034101 |
| 2 | female_smokers | 0.019551 |
| 5 | aged_65_older | 0.014214 |
| 3 | male_smokers | 0.001588 |
| 4 | life_expectancy | 0.000133 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Keep the location/date keys, the country-health predictors, and the target,
# restricted to the two countries being compared.
country_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date'] + country_cols + ['Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 46682.515 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 44017.591 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day / week / month);
# leading positions with no history are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' (and the
# lagged mortality features) in the PCA input, so the target leaks into the
# components later used as predictors -- confirm whether this is intended.
# NOTE(review): PCA is also fit on unscaled data covering the full dataset
# (before the train/test split), which leaks test-set information into the fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # of input variables for Random Forest Model Analysis
# Fix: fit/transform PCA on the predictors only. The original projected
# df_updated.iloc[:, 2:], which includes the target column 'Mortality Rate',
# leaking the target into the features (and inflating the reported R^2).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(predictors)
# Resulting dataframe after PCA. NOTE(review): these columns are principal
# components, not the original features; the original feature names are kept
# only so the downstream feature-importance labels stay consistent.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['hospital_beds_per_thousand', 'human_development_index',
                                     'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index',
                 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training partition only, so the test set does not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 configurations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10);
# n_jobs=-1 parallelises the 81 * 10 fits across all cores (results unchanged)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
# GridSearchCV refits the best configuration on the full training set by default
# (refit=True), and the base estimator already carries random_state=42, so reuse
# that estimator instead of re-instantiating and re-fitting an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments computes the KL
# divergence between the (normalised) y_test and y_pred distributions, not
# Shannon entropy, and requires non-negative inputs. Kept for continuity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004149548610657574 R2 Score: 0.9990096648287803 RMSE: 0.064417 Entropy Value: 0.0005668080870825645
# Rank the model inputs by their importance in the fitted forest,
# highest first, and display the resulting table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937978 |
| 0 | hospital_beds_per_thousand | 0.035832 |
| 2 | gdp_per_capita | 0.021979 |
| 3 | population_density | 0.003812 |
| 4 | population | 0.000400 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Iceland'
# Keep the location/date keys, the population-health predictors, and the target,
# restricted to the two countries being compared.
health_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
               'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date'] + health_cols + ['Mortality Rate']]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 255.569 | 4.02 | 24.5 | 39.3 | 78.74 | 19.452 | 42.7 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.11011 |
2097 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create per-country lagged mortality features (previous day / week / month);
# leading positions with no history are filled with 0.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (df_updated.groupby('location')['Mortality Rate']
                           .shift(lag)
                           .fillna(0))
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target column 'Mortality Rate' (and the
# lagged mortality features) in the PCA input, so the target leaks into the
# components later used as predictors -- confirm whether this is intended.
# NOTE(review): PCA is also fit on unscaled data covering the full dataset
# (before the train/test split), which leaks test-set information into the fit.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # of input variables for the Random Forest Model Analysis
# Fix: fit/transform PCA on the predictors only. The original projected
# df_updated.iloc[:, 2:], which includes the target column 'Mortality Rate',
# leaking the target into the features (and inflating the reported R^2).
predictors = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(predictors)
# Resulting dataframe after PCA. NOTE(review): these columns are principal
# components, not the original features; the original feature names are kept
# only so the downstream feature-importance labels stay consistent.
principal_df = pd.DataFrame(data=principal_components,
                            columns=['cardiovasc_death_rate', 'diabetes_prevalence',
                                     'female_smokers', 'male_smokers',
                                     'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
                 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training partition only, so the test set does not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both partitions using the training-set statistics.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model (fixed seed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 configurations
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10);
# n_jobs=-1 parallelises the 81 * 10 fits across all cores (results unchanged)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score (default scoring: R^2)
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984258099692642
# GridSearchCV refits the best configuration on the full training set by default
# (refit=True), and the base estimator already carries random_state=42, so reuse
# that estimator instead of re-instantiating and re-fitting an identical model.
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy with two arguments computes the KL
# divergence between the (normalised) y_test and y_pred distributions, not
# Shannon entropy, and requires non-negative inputs. Kept for continuity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0015282099631709807 R2 Score: 0.9971614614419819 RMSE: 0.039092 Entropy Value: 0.0011429224525441673
# Rank the model inputs by their importance in the fitted forest,
# highest first, and display the resulting table.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.563638 |
| 0 | cardiovasc_death_rate | 0.377660 |
| 6 | median_age | 0.054616 |
| 5 | aged_65_older | 0.002951 |
| 2 | female_smokers | 0.000807 |
| 3 | male_smokers | 0.000252 |
| 4 | life_expectancy | 0.000076 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- consider a relative path or a
# configurable data directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Estonia'
country2 = 'Iceland'
# Keep the location/date keys, the country-health predictors, and the target,
# restricted to the two countries being compared.
country_cols = ['hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            ['location', 'date'] + country_cols + ['Mortality Rate']]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 6249 | Estonia | 1/6/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6250 | Estonia | 1/18/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6251 | Estonia | 2/5/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6252 | Estonia | 2/6/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| 6253 | Estonia | 2/7/2020 | 4.69 | 0.892 | 29481.252 | 31.033 | 1326064 | 0.00000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 21942 | Iceland | 12/25/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21943 | Iceland | 12/26/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21944 | Iceland | 12/27/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21945 | Iceland | 12/28/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
| 21946 | Iceland | 12/29/2022 | 2.91 | 0.949 | 46482.958 | 3.404 | 372903 | 0.11011 |
2097 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month) per
# country; shift() leaves NaN at the head of each group, which is zeroed out.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Fit PCA on all numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also feeds 'Mortality Rate' and the lag columns
# into the PCA input — confirm the target is meant to be part of the projection.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per input variable in the Random Forest
# Model Analysis for the country health index.
n_components = 5
pc_labels = ['hospital_beds_per_thousand', 'human_development_index',
             'gdp_per_capita', 'population_density', 'population']
# Project the data onto the leading components.
# NOTE(review): the component columns reuse the raw feature names even though
# each principal component mixes all inputs — confirm this labeling is wanted.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=pc_labels,
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(pc_labels)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test splits with the statistics learned on the training set.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9967186075484848
# Refit a RandomForestRegressor using the best hyperparameters found above;
# best_params_ holds exactly the four tuned keyword arguments.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# normalized distributions, not Shannon entropy — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.003177612198314735 R2 Score: 0.9940978170770269 RMSE: 0.056370 Entropy Value: 0.001954468736081089
# Rank the input features by their importance in the fitted model (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.915291 |
| 0 | hospital_beds_per_thousand | 0.046809 |
| 2 | gdp_per_capita | 0.035884 |
| 3 | population_density | 0.001125 |
| 4 | population | 0.000891 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Latvia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Filter to the selected pair of countries. .copy() makes the result an
# independent DataFrame so the lagged-column assignments that follow do not
# trigger pandas' SettingWithCopyWarning (assignment into a view/slice).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 350.060 | 4.91 | 25.6 | 51.0 | 75.29 | 19.754 | 43.9 | 0.631969 |
2073 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month) per
# country; shift() leaves NaN at the head of each group, which is zeroed out.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Fit PCA on all numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also feeds 'Mortality Rate' and the lag columns
# into the PCA input — confirm the target is meant to be part of the projection.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 7 principal components — one per input variable in the Random Forest
# Model Analysis for the population health index.
n_components = 7
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Project the data onto the leading components.
# NOTE(review): the component columns reuse the raw feature names even though
# each principal component mixes all inputs — confirm this labeling is wanted.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=pc_labels,
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(pc_labels)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test splits with the statistics learned on the training set.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9980552037491671
# Refit a RandomForestRegressor using the best hyperparameters found above;
# best_params_ holds exactly the four tuned keyword arguments.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# normalized distributions, not Shannon entropy — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0032364660760083627 R2 Score: 0.9985316671525546 RMSE: 0.056890 Entropy Value: 0.0004855826815172552
# Rank the input features by their importance in the fitted model (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.739516 |
| 0 | cardiovasc_death_rate | 0.219378 |
| 2 | female_smokers | 0.032388 |
| 5 | aged_65_older | 0.003260 |
| 3 | male_smokers | 0.002877 |
| 6 | median_age | 0.002109 |
| 4 | life_expectancy | 0.000472 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Ireland'
country2 = 'Latvia'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Filter to the selected pair of countries. .copy() makes the result an
# independent DataFrame so the lagged-column assignments that follow do not
# trigger pandas' SettingWithCopyWarning (assignment into a view/slice).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 18838 | Ireland | 2/29/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18839 | Ireland | 3/1/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18840 | Ireland | 3/2/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18841 | Ireland | 3/3/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| 18842 | Ireland | 3/4/2020 | 2.96 | 0.955 | 67335.293 | 69.874 | 5023108 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20906 | Latvia | 12/25/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20907 | Latvia | 12/26/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631631 |
| 20908 | Latvia | 12/27/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20909 | Latvia | 12/28/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631485 |
| 20910 | Latvia | 12/29/2022 | 5.57 | 0.866 | 25063.846 | 31.212 | 1850654 | 0.631969 |
2073 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month) per
# country; shift() leaves NaN at the head of each group, which is zeroed out.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Fit PCA on all numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also feeds 'Mortality Rate' and the lag columns
# into the PCA input — confirm the target is meant to be part of the projection.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 5 principal components — one per input variable in the Random Forest
# Model Analysis for the country health index.
n_components = 5
pc_labels = ['hospital_beds_per_thousand', 'human_development_index',
             'gdp_per_capita', 'population_density', 'population']
# Project the data onto the leading components.
# NOTE(review): the component columns reuse the raw feature names even though
# each principal component mixes all inputs — confirm this labeling is wanted.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=pc_labels,
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(pc_labels)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test splits with the statistics learned on the training set.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9977520979997934
# Refit a RandomForestRegressor using the best hyperparameters found above;
# best_params_ holds exactly the four tuned keyword arguments.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of the
# normalized distributions, not Shannon entropy — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006423810097061833 R2 Score: 0.9970856201950677 RMSE: 0.080149 Entropy Value: 0.0011599657894503636
# Rank the input features by their importance in the fitted model (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.948187 |
| 2 | gdp_per_capita | 0.038317 |
| 0 | hospital_beds_per_thousand | 0.009184 |
| 3 | population_density | 0.003609 |
| 4 | population | 0.000702 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a relative or configurable path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Luxembourg'
country2 = 'Portugal'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Filter to the selected pair of countries. .copy() makes the result an
# independent DataFrame so the lagged-column assignments that follow do not
# trigger pandas' SettingWithCopyWarning (assignment into a view/slice).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.377872 |
2075 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (previous day / week / month) per
# country; shift() leaves NaN at the head of each group, which is zeroed out.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag_days).fillna(0)
    )
# Fit PCA on all numeric columns to address multi-collinearity.
# NOTE(review): iloc[:, 2:] also feeds 'Mortality Rate' and the lag columns
# into the PCA input — confirm the target is meant to be part of the projection.
pca = PCA()
pca.fit(df_updated.iloc[:, 2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep 7 principal components — one per input variable in the Random Forest
# Model Analysis for the population health index.
n_components = 7
pc_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Project the data onto the leading components.
# NOTE(review): the component columns reuse the raw feature names even though
# each principal component mixes all inputs — confirm this labeling is wanted.
principal_df = pd.DataFrame(
    pca.transform(df_updated.iloc[:, 2:])[:, :n_components],
    columns=pc_labels,
)
principal_df['location'] = df_updated['location'].values
# One-hot encode the categorical 'location' column via get_dummies().
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(pc_labels)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (no test-set leakage).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Scale the train and test splits with the statistics learned on the training set.
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))
# Base RandomForestRegressor whose hyperparameters the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Candidate hyperparameter values for the search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive search over the grid with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9965116751526303
# Refit the Random Forest with the best hyperparameters found by the grid
# search (equivalent to grid_search.best_estimator_, rebuilt explicitly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns a KL divergence, so this is not a standard
# regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001917842434237941 R2 Score: 0.9976831679134813 RMSE: 0.043793 Entropy Value: 0.00042605878052921417
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): each row is a principal component carrying an original
# column name, not the original variable itself.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.772730 |
| 0 | cardiovasc_death_rate | 0.186688 |
| 2 | female_smokers | 0.025317 |
| 5 | aged_65_older | 0.008044 |
| 6 | median_age | 0.004079 |
| 3 | male_smokers | 0.002494 |
| 4 | life_expectancy | 0.000648 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider making this
# configurable or relative so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Luxembourg'
country2 = 'Portugal'
# Restrict to the socio-economic inputs of the country-health Random Forest
# analysis, then to the rows of the two selected countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.39 | 0.864 | 27936.896 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17795 | Luxembourg | 12/25/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17796 | Luxembourg | 12/26/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17797 | Luxembourg | 12/27/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17798 | Luxembourg | 12/28/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
| 17799 | Luxembourg | 12/29/2022 | 4.51 | 0.916 | 94277.965 | 231.447 | 647601 | 0.377872 |
2075 rows × 8 columns
# Recast the OWID COVID-19 time series as a supervised-learning table by
# adding lagged mortality-rate columns (previous day / week / month) via
# shift(): Random Forests consume tabular, non-sequential rows, so the
# history must be encoded as explicit feature columns.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# Replace the warm-up NaNs produced by shift() with 0.
# NOTE(review): this injects artificial zero mortality at each country's
# start; dropping those rows may be preferable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# country-health inputs.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes
# 'Mortality Rate' and its lagged copies — the prediction target leaked into
# the model inputs. Fit/transform on the true input variables only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable of the country-health model.
n_components = 5
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# variables; the original names are kept only so downstream code still works.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# afterwards; kept for parity with the rest of the notebook.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(feature_cols)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing. NOTE(review): a random split shuffles
# time order; a chronological split would avoid look-ahead bias on a series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics learned from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest regressor tuned by exhaustive grid search with 10-fold CV.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9970475062842402
# Refit the Random Forest with the best hyperparameters found by the grid
# search (equivalent to grid_search.best_estimator_, rebuilt explicitly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns a KL divergence, so this is not a standard
# regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.001444462921026543 R2 Score: 0.9982550297232574 RMSE: 0.038006 Entropy Value: 0.00038138798942089766
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): each row is a principal component carrying an original
# column name, not the original variable itself.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.915538 |
| 0 | hospital_beds_per_thousand | 0.041139 |
| 2 | gdp_per_capita | 0.037230 |
| 3 | population_density | 0.005518 |
| 4 | population | 0.000575 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider making this
# configurable or relative so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Slovakia'
country2 = 'Sweden'
# Restrict to the population-health inputs of the Random Forest analysis,
# then to the rows of the two selected countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2092 rows × 10 columns
# Recast the OWID COVID-19 time series as a supervised-learning table by
# adding lagged mortality-rate columns (previous day / week / month) via
# shift(): Random Forests consume tabular, non-sequential rows, so the
# history must be encoded as explicit feature columns.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# Replace the warm-up NaNs produced by shift() with 0.
# NOTE(review): this injects artificial zero mortality at each country's
# start; dropping those rows may be preferable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# population-health inputs.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes
# 'Mortality Rate' and its lagged copies — the prediction target leaked into
# the model inputs. Fit/transform on the true input variables only.
feature_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable of the population-health model.
n_components = 7
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# variables; the original names are kept only so downstream code still works.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# afterwards; kept for parity with the rest of the notebook.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(feature_cols)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing. NOTE(review): a random split shuffles
# time order; a chronological split would avoid look-ahead bias on a series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics learned from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest regressor tuned by exhaustive grid search with 10-fold CV.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9956178650087402
# Refit the Random Forest with the best hyperparameters found by the grid
# search (equivalent to grid_search.best_estimator_, rebuilt explicitly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns a KL divergence, so this is not a standard
# regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.028400997059026503 R2 Score: 0.993865518212125 RMSE: 0.168526 Entropy Value: 0.001669275065011816
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): each row is a principal component carrying an original
# column name, not the original variable itself.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.547703 |
| 0 | cardiovasc_death_rate | 0.411195 |
| 5 | aged_65_older | 0.017001 |
| 6 | median_age | 0.011947 |
| 2 | female_smokers | 0.008400 |
| 3 | male_smokers | 0.003192 |
| 4 | life_expectancy | 0.000561 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider making this
# configurable or relative so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run.
country1 = 'Slovakia'
country2 = 'Sweden'
# Restrict to the socio-economic inputs of the country-health Random Forest
# analysis, then to the rows of the two selected countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.82 | 0.860 | 30155.152 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 46949.283 | 24.718 | 10549349 | 0.816005 |
2092 rows × 8 columns
# Recast the OWID COVID-19 time series as a supervised-learning table by
# adding lagged mortality-rate columns (previous day / week / month) via
# shift(): Random Forests consume tabular, non-sequential rows, so the
# history must be encoded as explicit feature columns.
df_updated['prev_day_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby('location')['Mortality Rate'].shift(30)
# Replace the warm-up NaNs produced by shift() with 0.
# NOTE(review): this injects artificial zero mortality at each country's
# start; dropping those rows may be preferable.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
# Principal Component Analysis to address multi-collinearity among the
# country-health inputs.
# BUG FIX: the original fit PCA on df_updated.iloc[:, 2:], which includes
# 'Mortality Rate' and its lagged copies — the prediction target leaked into
# the model inputs. Fit/transform on the true input variables only.
feature_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[feature_cols])
# Keep one component per input variable of the country-health model.
n_components = 5
principal_components = pca.transform(df_updated[feature_cols])[:, :n_components]
# NOTE(review): these columns hold principal components, not the original
# variables; the original names are kept only so downstream code still works.
principal_df = pd.DataFrame(data=principal_components, columns=feature_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'. NOTE(review): the dummy columns are never used
# afterwards; kept for parity with the rest of the notebook.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = list(feature_cols)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows for testing. NOTE(review): a random split shuffles
# time order; a chronological split would avoid look-ahead bias on a series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize using statistics learned from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Random Forest regressor tuned by exhaustive grid search with 10-fold CV.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and their mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9937432736131026
# Refit the Random Forest with the best hyperparameters found by the grid
# search (equivalent to grid_search.best_estimator_, rebuilt explicitly).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Test-set evaluation: MSE, RMSE, R^2 score, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its arguments into probability
# distributions and returns a KL divergence, so this is not a standard
# regression metric — confirm it is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.043275743993950025 R2 Score: 0.990652642833775 RMSE: 0.208028 Entropy Value: 0.0026706049946896467
# Rank the model inputs by Random Forest importance, highest first.
# NOTE(review): each row is a principal component carrying an original
# column name, not the original variable itself.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.967774 |
| 2 | gdp_per_capita | 0.018507 |
| 0 | hospital_beds_per_thousand | 0.008734 |
| 3 | population_density | 0.003941 |
| 4 | population | 0.001044 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider making this
# configurable or relative so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for Random Forest Model Analysis for the population health index
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[health_cols][in_pair]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows at the start of each country's history have no lag, so fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its three lag columns, so the fitted components encode the target itself
# (target leakage) — this likely inflates the downstream R^2. Also, PCA is fit
# on unscaled data here (StandardScaler is applied only after the transform),
# so high-variance columns dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling them with the original feature names is misleading, and
# the later "feature importance" table inherits those labels. Consider naming
# them PC1..PC7 and interpreting importances via the PCA loadings instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this one-hot encoding only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of X and y align positionally because principal_df was built from
# df_updated's values in the same order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this shuffles a time series before splitting; a chronological
# split (shuffle=False or a date cutoff) would avoid training on future rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.944267642996866
# Refit the Random Forest on the full training set with the hyperparameters
# selected by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model refit on the training data, so the
# explicit re-construction below is redundant but harmless.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print(f"RMSE: {rmse:f}")  # ":f" preserves the old "%f" 6-decimal formatting
print("Entropy Value:", entropy_val)
MSE: 1.43474289655929 R2 Score: 0.9257898367511572 RMSE: 1.197808 Entropy Value: 0.01142862992057888
# Rank the model's inputs by Random Forest importance score, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.867049 |
| 6 | median_age | 0.035735 |
| 2 | female_smokers | 0.027527 |
| 3 | male_smokers | 0.020658 |
| 4 | life_expectancy | 0.018178 |
| 0 | cardiovasc_death_rate | 0.016531 |
| 5 | aged_65_older | 0.014323 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for the Random Forest Model Analysis for the country health index
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[country_cols][in_pair]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 18563.307 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.540 | 0.932 | 39753.244 | 272.898 | 67508936 | 0.883564 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows at the start of each country's history have no lag, so fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its three lag columns, so the fitted components encode the target itself
# (target leakage) — this likely inflates the downstream R^2. Also, PCA is fit
# on unscaled data here (StandardScaler is applied only after the transform),
# so high-variance columns such as population dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling them with the original feature names is misleading, and
# the later "feature importance" table inherits those labels. Consider naming
# them PC1..PC5 and interpreting importances via the PCA loadings instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this one-hot encoding only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Rows of X and y align positionally because principal_df was built from
# df_updated's values in the same order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this shuffles a time series before splitting; a chronological
# split (shuffle=False or a date cutoff) would avoid training on future rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9339708697999024
# Refit the Random Forest on the full training set with the hyperparameters
# selected by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model refit on the training data, so the
# explicit re-construction below is redundant but harmless.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print(f"RMSE: {rmse:f}")  # ":f" preserves the old "%f" 6-decimal formatting
print("Entropy Value:", entropy_val)
MSE: 0.7342554973835562 R2 Score: 0.9620216134487464 RMSE: 0.856887 Entropy Value: 0.007601507191777842
# Rank the model's inputs by Random Forest importance score, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.881448 |
| 2 | gdp_per_capita | 0.054324 |
| 4 | population | 0.037373 |
| 3 | population_density | 0.025757 |
| 0 | hospital_beds_per_thousand | 0.001098 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Italy'
country2 = 'Romania'
# Extracting important features for Random Forest Model Analysis for the population health index
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
               'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[health_cols][in_pair]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2102 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day) per country;
# rows at the start of each country's history have no lag, so fill with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes the 'Mortality Rate' target and
# its three lag columns, so the fitted components encode the target itself
# (target leakage) — this likely inflates the downstream R^2. Also, PCA is fit
# on unscaled data here (StandardScaler is applied only after the transform),
# so high-variance columns dominate the components. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7  # number of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labelling them with the original feature names is misleading, and
# the later "feature importance" table inherits those labels. Consider naming
# them PC1..PC7 and interpreting importances via the PCA loadings instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X is built from principal_df
# below — so this one-hot encoding only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of X and y align positionally because principal_df was built from
# df_updated's values in the same order.
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): this shuffles a time series before splitting; a chronological
# split (shuffle=False or a date cutoff) would avoid training on future rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize train and test features using the scaler fitted on the training set
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor whose hyperparameters the grid search will tune
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space for the tuning step
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameters and their mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9987132285641792
# Refit the Random Forest on the full training set with the hyperparameters
# selected by the grid search above.
# NOTE(review): grid_search.best_estimator_ (refit=True is the GridSearchCV
# default) is already this exact model refit on the training data, so the
# explicit re-construction below is redundant but harmless.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and returns the KL divergence D(y_test || y_pred); it is not a
# standard regression metric — interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print(f"RMSE: {rmse:f}")  # ":f" preserves the old "%f" 6-decimal formatting
print("Entropy Value:", entropy_val)
MSE: 0.009270441753640613 R2 Score: 0.9991156527114821 RMSE: 0.096283 Entropy Value: 0.00027465585171992864
# Rank the model's inputs by Random Forest importance score, highest first
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.531054 |
| 0 | cardiovasc_death_rate | 0.434367 |
| 5 | aged_65_older | 0.018956 |
| 2 | female_smokers | 0.010941 |
| 3 | male_smokers | 0.002241 |
| 6 | median_age | 0.002159 |
| 4 | life_expectancy | 0.000282 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this notebook only runs on
# this machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe (notebook cell output)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Pair of countries compared in this run of the analysis
country1 = 'Italy'
country2 = 'Romania'
# Extracting important features for the Random Forest Model Analysis for the country health index
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated[country_cols][in_pair]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 17800 | Romania | 2/26/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17801 | Romania | 2/27/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17802 | Romania | 2/28/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17803 | Romania | 2/29/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| 17804 | Romania | 3/1/2020 | 6.892 | 0.828 | 23313.199 | 85.129 | 19659270 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.180 | 0.892 | 35220.084 | 205.859 | 59037472 | 0.735109 |
2102 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# The shift is applied per 'location' group so a lag never crosses the boundary between countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in the lag columns with 0: the first 1/7/30 rows of each
# country have no history to shift from.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which at this
# point INCLUDES 'Mortality Rate' and the three lagged-mortality columns — the target
# (and its lags) therefore leak into the principal components later used as model
# inputs, which likely explains the near-perfect R^2 downstream. Confirm intent.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns (e.g.
# 'population') dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the same (target-containing) column block onto the first 5 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are PC1..PC5, merely RELABELED with the original feature
# names — each column is a linear mix of ALL inputs, not the named feature itself.
# Downstream feature-importance output inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns created here are never selected into X below, so this
# step only affects df_updated's shape, not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn the standardization statistics from the training split only, chaining the fit
# onto construction (fit() returns the scaler itself).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training split with the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
# Standardize the test split with the SAME fitted scaler (no refit on test data).
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search; the fixed seed makes results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (81 * 10 = 810 fits).
# n_jobs=-1 runs the CV fits on all cores; it does not change the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score (R^2, the regressor default).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.998323341861008
# Refit a fresh forest on the training split using the best hyperparameters found above.
# Unpacking grid_search.best_params_ avoids re-typing every key by hand (the original
# duplicated each hyperparameter name, an easy place for a typo to hide).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# inputs after normalizing each to a probability distribution — it is not a standard
# regression error metric, and it is infinite wherever y_pred is 0 but y_test is not.
# Confirm this is the intended diagnostic.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011774878731579577 R2 Score: 0.9988767437026601 RMSE: 0.108512 Entropy Value: 0.0002876750864884934
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X was built from principal components that were only RELABELED with the
# original feature names, so each "importance" below belongs to a principal component
# (a mix of all inputs), not to the named feature itself.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Highest-importance component first.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.966681 |
| 2 | gdp_per_capita | 0.025068 |
| 0 | hospital_beds_per_thousand | 0.003943 |
| 3 | population_density | 0.003464 |
| 4 | population | 0.000845 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — runs only on this machine; consider a
# relative path. The file was written with to_csv() including the index, so the read
# frame carries an extra unnamed index column.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Spain'
country2 = 'United States'
# Restrict in one step to the two countries of interest and to the identifiers,
# the population-health-index predictors, and the Mortality Rate target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2136 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# The shift is applied per 'location' group so a lag never crosses the boundary between countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in the lag columns with 0: the first 1/7/30 rows of each
# country have no history to shift from.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which at this
# point INCLUDES 'Mortality Rate' and the three lagged-mortality columns — the target
# (and its lags) therefore leak into the principal components later used as model
# inputs, which likely explains the near-perfect R^2 downstream. Confirm intent.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns dominate the
# components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project the same (target-containing) column block onto the first 7 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are PC1..PC7, merely RELABELED with the original feature
# names — each column is a linear mix of ALL inputs, not the named feature itself.
# Downstream feature-importance output inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns created here are never selected into X below, so this
# step only affects df_updated's shape, not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn the standardization statistics from the training split only, chaining the fit
# onto construction (fit() returns the scaler itself).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training split with the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
# Standardize the test split with the SAME fitted scaler (no refit on test data).
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search; the fixed seed makes results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (81 * 10 = 810 fits).
# n_jobs=-1 runs the CV fits on all cores; it does not change the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score (R^2, the regressor default).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9975684860546954
# Refit a fresh forest on the training split using the best hyperparameters found above.
# Unpacking grid_search.best_params_ avoids re-typing every key by hand (the original
# duplicated each hyperparameter name, an easy place for a typo to hide).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# inputs after normalizing each to a probability distribution — it is not a standard
# regression error metric, and it is infinite wherever y_pred is 0 but y_test is not.
# Confirm this is the intended diagnostic.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.09278060807270389 R2 Score: 0.9843829874630111 RMSE: 0.304599 Entropy Value: 0.0033059645967402937
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X was built from principal components that were only RELABELED with the
# original feature names, so each "importance" below belongs to a principal component
# (a mix of all inputs), not to the named feature itself.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Highest-importance component first.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.910893 |
| 0 | cardiovasc_death_rate | 0.056002 |
| 2 | female_smokers | 0.019922 |
| 5 | aged_65_older | 0.005333 |
| 6 | median_age | 0.005246 |
| 3 | male_smokers | 0.002155 |
| 4 | life_expectancy | 0.000449 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — runs only on this machine; consider a
# relative path. The file was written with to_csv() including the index, so the read
# frame carries an extra unnamed index column.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the country-health-index analysis.
country1 = 'Spain'
country2 = 'United States'
# Restrict in one step to the two countries of interest and to the identifiers,
# the country-health-index predictors, and the Mortality Rate target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
     'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | gdp_per_capita | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 34272.360 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 54225.446 | 35.608 | 338289856 | 1.084791 |
2136 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# The shift is applied per 'location' group so a lag never crosses the boundary between countries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# Replace NaN values in the lag columns with 0: the first 1/7/30 rows of each
# country have no history to shift from.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which at this
# point INCLUDES 'Mortality Rate' and the three lagged-mortality columns — the target
# (and its lags) therefore leak into the principal components later used as model
# inputs, which likely explains the near-perfect R^2 downstream. Confirm intent.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns (e.g.
# 'population') dominate the components; consider standardizing before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project the same (target-containing) column block onto the first 5 principal components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are PC1..PC5, merely RELABELED with the original feature
# names — each column is a linear mix of ALL inputs, not the named feature itself.
# Downstream feature-importance output inherits this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the dummy columns created here are never selected into X below, so this
# step only affects df_updated's shape, not the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Learn the standardization statistics from the training split only, chaining the fit
# onto construction (fit() returns the scaler itself).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize the training split with the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
# Standardize the test split with the SAME fitted scaler (no refit on test data).
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyperparameter search; the fixed seed makes results reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (81 * 10 = 810 fits).
# n_jobs=-1 runs the CV fits on all cores; it does not change the selected model.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean CV score (R^2, the regressor default).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974620993069001
# Refit a fresh forest on the training split using the best hyperparameters found above.
# Unpacking grid_search.best_params_ avoids re-typing every key by hand (the original
# duplicated each hyperparameter name, an easy place for a typo to hide).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, R^2, and "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between the two
# inputs after normalizing each to a probability distribution — it is not a standard
# regression error metric, and it is infinite wherever y_pred is 0 but y_test is not.
# Confirm this is the intended diagnostic.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.06762065995368763 R2 Score: 0.988617958901189 RMSE: 0.260040 Entropy Value: 0.0024049365914767576
feature_importances = best_rf_model.feature_importances_
# NOTE(review): X was built from principal components that were only RELABELED with the
# original feature names, so each "importance" below belongs to a principal component
# (a mix of all inputs), not to the named feature itself.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
# Highest-importance component first.
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.959159 |
| 2 | gdp_per_capita | 0.027860 |
| 0 | hospital_beds_per_thousand | 0.008511 |
| 3 | population_density | 0.003790 |
| 4 | population | 0.000679 |
# Country Pair by Pair Analysis relative to gdp_per_capita
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset.
# NOTE(review): hard-coded absolute Windows path — runs only on this machine; consider a
# relative path or pathlib.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | stringency_index | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 11.11 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27269 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086136 | United States |
| 27270 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.086032 | United States |
| 27271 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.085212 | United States |
| 27272 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084986 | United States |
| 27273 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 23.18 | 1.084791 | United States |
27274 rows × 18 columns
# Showing the pairings of countries based on gdp_per_capita (13 pairs of countries).
# One single-country slice of the cleaned dataset per country of interest.
df_Ireland = df[(df.location == "Ireland")]
df_Luxembourg = df[(df.location == "Luxembourg")]
df_Switzerland = df[(df.location == "Switzerland")]
df_UnitedStates = df[(df.location == "United States")]
df_Austria = df[(df.location == "Austria")]
df_Belgium = df[(df.location == "Belgium")]
df_Canada = df[(df.location == "Canada")]
df_Denmark = df[(df.location == "Denmark")]
df_Finland = df[(df.location == "Finland")]
df_France = df[(df.location == "France")]
df_Iceland = df[(df.location == "Iceland")]
df_Italy = df[(df.location == "Italy")]
df_Netherlands = df[(df.location == "Netherlands")]
df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]
df_Bulgaria = df[(df.location == "Bulgaria")]
df_Cyprus = df[(df.location == "Cyprus")]
df_Czechia = df[(df.location == "Czechia")]
df_Estonia = df[(df.location == "Estonia")]
df_Latvia = df[(df.location == "Latvia")]
df_Portugal = df[(df.location == "Portugal")]
df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]
df_Slovakia = df[(df.location == "Slovakia")]
df_Slovenia = df[(df.location == "Slovenia")]
df_Spain = df[(df.location == "Spain")]
# tail(-2) drops the FIRST two rows of the UK slice.
# NOTE(review): the reason is not visible here — presumably two anomalous leading rows;
# confirm and document why the UK alone is trimmed.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)
# Exporting final_dataframe to CSV file.
# NOTE(review): the index is written too (no index=False), so re-importing this file adds
# an extra unnamed column.
dataframe_one.to_csv("dataframe-one.csv")
# Importing the dataframe that includes the first countries in each pairing of countries from previous step.
# NOTE(review): hard-coded absolute Windows path — runs only on this machine; consider a
# relative path. The file was written with to_csv() including the index, so the read
# frame carries an extra unnamed index column.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pair compared in this run of the population-health-index analysis.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Restrict in one step to the two countries of interest and to the identifiers,
# the population-health-index predictors, and the Mortality Rate target.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
     'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
     'median_age', 'Mortality Rate'],
]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 128.275 | 4.42 | 20.9 | 26.0 | 82.25 | 14.312 | 39.7 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 126.459 | 3.28 | 23.0 | 25.7 | 82.30 | 13.928 | 38.7 | 0.491388 |
2076 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the target so the time series becomes a supervised
# learning table usable by a Random Forest (which ignores row order).
# FIX: the original re-grouped the frame once per lag and ran a separate
# fillna pass per column; a single groupby object is reused for all lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() leaves NaN in the first `lag` rows of each country; fill with 0
    # so the feature matrix stays complete. NOTE(review): rows are not strictly
    # consecutive calendar days (the data has date gaps, e.g. 2/12 -> 2/24), so
    # "previous day/week/month" really means "1/7/30 observations earlier".
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among
# the predictors.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column — i.e. the prediction target itself
# leaked into the components, inflating every downstream model score.
# Build the PCA input from predictor columns only (health features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 7 components — one per health-index input variable.
# NOTE(review): PCA is fitted on the full dataset and on unscaled features;
# fitting on the training split only, after standardization, would be cleaner.
n_components = 7
principal_components = pca.transform(pca_input)[:, :n_components]
# The column labels below are kept for compatibility with later cells, but
# they name principal components, NOT the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below, so
# this encoding has no effect on the model — confirm whether the location
# dummies were meant to be part of the feature matrix.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds principal components (labelled with the original feature names).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; for time-series-derived
# rows this mixes past and future observations across the split and can make
# test scores optimistic — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training partition only, so the test set is
# later scaled with training statistics (no test information leaks into it).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; random_state fixed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses standard KFold on rows derived from a time series;
# a TimeSeriesSplit would respect temporal order. Also consider n_jobs=-1 to
# parallelise the 81-combination search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9981832621613457
# Refit the Random Forest with the best hyperparameters found above.
# FIX: unpack best_params_ directly instead of re-typing each key by hand —
# equivalent behaviour, and it stays in sync if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after
# normalizing each array to sum to 1 — it is not a standard regression metric,
# and it is undefined (inf) wherever y_test is nonzero but y_pred is 0.
# Interpret this value with caution, or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002099363526029487 R2 Score: 0.9990806391844732 RMSE: 0.045819 Entropy Value: 0.0003826774935355706
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal components that merely reuse the
# original feature names, so these importances describe components rather
# than the raw health indicators themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 5 | aged_65_older | 0.508703 |
| 0 | cardiovasc_death_rate | 0.420926 |
| 2 | female_smokers | 0.028547 |
| 6 | median_age | 0.024134 |
| 1 | diabetes_prevalence | 0.015762 |
| 3 | male_smokers | 0.001625 |
| 4 | life_expectancy | 0.000304 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# author's machine; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this section.
country1 = 'Ireland'
country2 = 'Luxembourg'
# Restrict the frame to the country-health-index columns and to the two
# countries of this pairing in a single .loc indexing step.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 16759 | Luxembourg | 2/12/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16760 | Luxembourg | 2/24/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16761 | Luxembourg | 2/25/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16762 | Luxembourg | 2/26/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| 16763 | Luxembourg | 2/27/2020 | 4.51 | 0.916 | 0.2 | 231.447 | 647601 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19868 | Ireland | 12/25/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19869 | Ireland | 12/26/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19870 | Ireland | 12/27/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19871 | Ireland | 12/28/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
| 19872 | Ireland | 12/29/2022 | 2.96 | 0.955 | 0.2 | 69.874 | 5023108 | 0.491388 |
2076 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the target so the time series becomes a supervised
# learning table usable by a Random Forest (which ignores row order).
# FIX: the original re-grouped the frame once per lag and ran a separate
# fillna pass per column; a single groupby object is reused for all lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() leaves NaN in the first `lag` rows of each country; fill with 0
    # so the feature matrix stays complete. NOTE(review): rows are not strictly
    # consecutive calendar days (the data has date gaps), so "previous
    # day/week/month" really means "1/7/30 observations earlier".
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among
# the predictors.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column — i.e. the prediction target itself
# leaked into the components, inflating every downstream model score.
# Build the PCA input from predictor columns only (country features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 5 components — one per country-health-index input variable.
# NOTE(review): PCA is fitted on the full dataset and on unscaled features;
# fitting on the training split only, after standardization, would be cleaner.
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# The column labels below are kept for compatibility with later cells, but
# they name principal components, NOT the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below, so
# this encoding has no effect on the model — confirm whether the location
# dummies were meant to be part of the feature matrix.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X holds principal components (labelled with the original feature names).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; for time-series-derived
# rows this mixes past and future observations across the split and can make
# test scores optimistic — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training partition only, so the test set is
# later scaled with training statistics (no test information leaks into it).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; random_state fixed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses standard KFold on rows derived from a time series;
# a TimeSeriesSplit would respect temporal order. Also consider n_jobs=-1 to
# parallelise the 81-combination search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9981230977543898
# Refit the Random Forest with the best hyperparameters found above.
# FIX: unpack best_params_ directly instead of re-typing each key by hand —
# equivalent behaviour, and it stays in sync if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after
# normalizing each array to sum to 1 — it is not a standard regression metric,
# and it is undefined (inf) wherever y_test is nonzero but y_pred is 0.
# Interpret this value with caution, or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0038624036497071725 R2 Score: 0.9983085623212649 RMSE: 0.062148 Entropy Value: 0.0008228738012960621
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal components that merely reuse the
# original feature names, so these importances describe components rather
# than the raw country indicators themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.920449 |
| 2 | extreme_poverty | 0.043623 |
| 0 | hospital_beds_per_thousand | 0.027483 |
| 3 | population_density | 0.007933 |
| 4 | population | 0.000512 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# author's machine; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this section.
country1 = 'Switzerland'
country2 = 'United States'
# Restrict the frame to the population-health-index columns and to the two
# countries of this pairing in a single .loc indexing step.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 99.739 | 5.59 | 22.6 | 28.9 | 83.78 | 18.436 | 43.1 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086136 |
| 27268 | United States | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.086032 |
| 27269 | United States | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.085212 |
| 27270 | United States | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084986 |
| 27271 | United States | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 38.3 | 1.084791 |
2112 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the target so the time series becomes a supervised
# learning table usable by a Random Forest (which ignores row order).
# FIX: the original re-grouped the frame once per lag and ran a separate
# fillna pass per column; a single groupby object is reused for all lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() leaves NaN in the first `lag` rows of each country; fill with 0
    # so the feature matrix stays complete. NOTE(review): rows are not strictly
    # consecutive calendar days (the data has date gaps), so "previous
    # day/week/month" really means "1/7/30 observations earlier".
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Principal Component Analysis (PCA) to mitigate multi-collinearity among
# the predictors.
# BUG FIX: the original fitted PCA on df_updated.iloc[:, 2:], which still
# contains the 'Mortality Rate' column — i.e. the prediction target itself
# leaked into the components, inflating every downstream model score.
# Build the PCA input from predictor columns only (health features + lags).
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep 7 components — one per health-index input variable.
# NOTE(review): PCA is fitted on the full dataset and on unscaled features;
# fitting on the training split only, after standardization, would be cleaner.
n_components = 7
principal_components = pca.transform(pca_input)[:, :n_components]
# The column labels below are kept for compatibility with later cells, but
# they name principal components, NOT the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never added to X below, so
# this encoding has no effect on the model — confirm whether the location
# dummies were meant to be part of the feature matrix.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds principal components (labelled with the original feature names).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): train_test_split shuffles by default; for time-series-derived
# rows this mixes past and future observations across the split and can make
# test scores optimistic — consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardizer on the training partition only, so the test set is
# later scaled with training statistics (no test information leaks into it).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
# (base estimator for the grid search; random_state fixed for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses standard KFold on rows derived from a time series;
# a TimeSeriesSplit would respect temporal order. Also consider n_jobs=-1 to
# parallelise the 81-combination search.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.982328888461846
# Refit the Random Forest with the best hyperparameters found above.
# FIX: unpack best_params_ directly instead of re-typing each key by hand —
# equivalent behaviour, and it stays in sync if the grid ever changes.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
# Predict on the held-out (scaled) test features.
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence after
# normalizing each array to sum to 1 — it is not a standard regression metric,
# and it is undefined (inf) wherever y_test is nonzero but y_pred is 0.
# Interpret this value with caution, or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.014574168625324405 R2 Score: 0.9932005000328458 RMSE: 0.120724 Entropy Value: 0.0009401813968172323
# Rank the model inputs by impurity-based feature importance.
# NOTE(review): the inputs are principal components that merely reuse the
# original feature names, so these importances describe components rather
# than the raw health indicators themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.934124 |
| 2 | female_smokers | 0.035472 |
| 6 | median_age | 0.010925 |
| 3 | male_smokers | 0.006711 |
| 5 | aged_65_older | 0.004865 |
| 0 | cardiovasc_death_rate | 0.004014 |
| 4 | life_expectancy | 0.003889 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — this cell only runs on the
# author's machine; consider a relative path or a configuration variable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Country pairing analysed in this section.
country1 = 'Switzerland'
country2 = 'United States'
# Restrict the frame to the country-health-index columns and to the two
# countries of this pairing in a single .loc indexing step.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, keep_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 13610 | Switzerland | 2/25/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13611 | Switzerland | 2/26/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13612 | Switzerland | 2/27/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13613 | Switzerland | 2/28/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| 13614 | Switzerland | 2/29/2020 | 4.53 | 0.955 | 0.03 | 214.243 | 8740471 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | United States | 12/25/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.086136 |
| 27268 | United States | 12/26/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.086032 |
| 27269 | United States | 12/27/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.085212 |
| 27270 | United States | 12/28/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.084986 |
| 27271 | United States | 12/29/2022 | 2.77 | 0.926 | 1.20 | 35.608 | 338289856 | 1.084791 |
2112 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged copies of the target so the time series becomes a supervised
# learning table usable by a Random Forest (which ignores row order).
# FIX: the original re-grouped the frame once per lag and ran a separate
# fillna pass per column; a single groupby object is reused for all lags.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    # shift() leaves NaN in the first `lag` rows of each country; fill with 0
    # so the feature matrix stays complete. NOTE(review): rows are not strictly
    # consecutive calendar days (the data has date gaps), so "previous
    # day/week/month" really means "1/7/30 observations earlier".
    df_updated[lag_col] = mortality_by_country.shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit input
# includes 'Mortality Rate' and the lagged mortality columns — the target (and
# its history) feeds the PCA. Confirm whether components were meant to be
# built from the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Keep only the first 5 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores (linear combinations
# of all inputs), not the original variables — labelling them with raw feature
# names is misleading when downstream feature importances are read as
# importances of those variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely
# from principal_df — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the rows 70/30 into train and test sets (seeded for reproducibility).
# NOTE(review): the rows are a time series with lagged-target features; a
# random split lets future information into training — a chronological split
# would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardiser on the training split only (fit returns the scaler).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same train-fitted scaler, so no statistics leak from the test set)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE: GridSearchCV scores regressors with their default metric (R^2);
# 108 parameter combinations x 10 folds — n_jobs=-1 would parallelise the
# search if runtime matters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best CV score: 0.9792894494169208
# fit random forest model with best hyperparameters from above
# (all tuned values come from param_grid, so they can be unpacked directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# their KL divergence — applied to raw target/prediction vectors it is not a
# standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.018987374157218375 R2 Score: 0.9911415427337641 RMSE: 0.137795 Entropy Value: 0.0011604753693985666
# Rank the model inputs by learned importance, highest first, and display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.942162 |
| 2 | extreme_poverty | 0.041692 |
| 3 | population_density | 0.009401 |
| 4 | population | 0.004204 |
| 0 | hospital_beds_per_thousand | 0.002541 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Belgium'
# Extracting important features for Random Forest Model Analysis for the population health index
# Restrict to the population-health feature set and to this country pairing.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 1 | Austria | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 2 | Austria | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 3 | Austria | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| 4 | Austria | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 44.4 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 114.898 | 4.29 | 25.1 | 31.4 | 81.63 | 18.571 | 41.8 | 0.711787 |
2099 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive supervised-learning features from the mortality time series: each
# country's rate shifted back 1 day, 7 days, and 30 days. Rows with no
# history yet (start of a country's series) are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit input
# includes 'Mortality Rate' and the lagged mortality columns — the target (and
# its history) feeds the PCA. Confirm whether components were meant to be
# built from the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first 7 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores (linear combinations
# of all inputs), not the original variables — labelling them with raw feature
# names is misleading when downstream feature importances are read as
# importances of those variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely
# from principal_df — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the rows 70/30 into train and test sets (seeded for reproducibility).
# NOTE(review): the rows are a time series with lagged-target features; a
# random split lets future information into training — a chronological split
# would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardiser on the training split only (fit returns the scaler).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same train-fitted scaler, so no statistics leak from the test set)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE: GridSearchCV scores regressors with their default metric (R^2);
# 108 parameter combinations x 10 folds — n_jobs=-1 would parallelise the
# search if runtime matters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985702277201526
# fit random forest model with best hyperparameters from above
# (all tuned values come from param_grid, so they can be unpacked directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# their KL divergence — applied to raw target/prediction vectors it is not a
# standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.020452761128882797 R2 Score: 0.9982725374621599 RMSE: 0.143013 Entropy Value: 0.0008156180119545589
# Rank the model inputs by learned importance, highest first, and display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 6 | median_age | 0.853422 |
| 1 | diabetes_prevalence | 0.089967 |
| 0 | cardiovasc_death_rate | 0.036597 |
| 5 | aged_65_older | 0.011508 |
| 3 | male_smokers | 0.004570 |
| 2 | female_smokers | 0.003805 |
| 4 | life_expectancy | 0.000130 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Austria'
country2 = 'Belgium'
# Extracting important features for the Random Forest Model Analysis for the country health index
# Restrict to the country-health feature set and to this country pairing.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 0 | Austria | 2/25/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 1 | Austria | 2/26/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 2 | Austria | 2/27/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 3 | Austria | 2/28/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| 4 | Austria | 2/29/2020 | 7.37 | 0.922 | 0.7 | 106.749 | 8939617 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2094 | Belgium | 12/25/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2095 | Belgium | 12/26/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2096 | Belgium | 12/27/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2097 | Belgium | 12/28/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
| 2098 | Belgium | 12/29/2022 | 5.64 | 0.931 | 0.2 | 375.564 | 11655923 | 0.711787 |
2099 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive supervised-learning features from the mortality time series: each
# country's rate shifted back 1 day, 7 days, and 30 days. Rows with no
# history yet (start of a country's series) are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit input
# includes 'Mortality Rate' and the lagged mortality columns — the target (and
# its history) feeds the PCA. Confirm whether components were meant to be
# built from the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Keep only the first 5 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores (linear combinations
# of all inputs), not the original variables — labelling them with raw feature
# names is misleading when downstream feature importances are read as
# importances of those variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely
# from principal_df — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the rows 70/30 into train and test sets (seeded for reproducibility).
# NOTE(review): the rows are a time series with lagged-target features; a
# random split lets future information into training — a chronological split
# would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardiser on the training split only (fit returns the scaler).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same train-fitted scaler, so no statistics leak from the test set)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE: GridSearchCV scores regressors with their default metric (R^2);
# 108 parameter combinations x 10 folds — n_jobs=-1 would parallelise the
# search if runtime matters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9974451044142482
# fit random forest model with best hyperparameters from above
# (all tuned values come from param_grid, so they can be unpacked directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# their KL divergence — applied to raw target/prediction vectors it is not a
# standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010069518672884694 R2 Score: 0.9991495174577224 RMSE: 0.100347 Entropy Value: 0.00034019584039697294
# Rank the model inputs by learned importance, highest first, and display.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.935823 |
| 2 | extreme_poverty | 0.050241 |
| 0 | hospital_beds_per_thousand | 0.009467 |
| 3 | population_density | 0.003676 |
| 4 | population | 0.000793 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — breaks on any other
# machine; consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for Random Forest Model Analysis for the population health index
# Restrict to the population-health feature set and to this country pairing.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 114.767 | 6.41 | 19.3 | 18.8 | 80.90 | 19.677 | 42.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 105.599 | 7.37 | 12.0 | 16.6 | 82.43 | 16.984 | 41.4 | 1.093162 |
2134 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive supervised-learning features from the mortality time series: each
# country's rate shifted back 1 day, 7 days, and 30 days. Rows with no
# history yet (start of a country's series) are filled with 0.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate']
        .shift(lag_days)
        .fillna(0)
    )
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit input
# includes 'Mortality Rate' and the lagged mortality columns — the target (and
# its history) feeds the PCA. Confirm whether components were meant to be
# built from the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Keep only the first 7 component scores for every row.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA component scores (linear combinations
# of all inputs), not the original variables — labelling them with raw feature
# names is misleading when downstream feature importances are read as
# importances of those variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built solely
# from principal_df — so this encoding has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the rows 70/30 into train and test sets (seeded for reproducibility).
# NOTE(review): the rows are a time series with lagged-target features; a
# random split lets future information into training — a chronological split
# would avoid look-ahead leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the standardiser on the training split only (fit returns the scaler).
scaler = StandardScaler().fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
# Apply scaling on the test set
# (same train-fitted scaler, so no statistics leak from the test set)
X_test_scaled = scaler.transform(X_test)
# Instantiate the RandomForestRegressor Model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# define parameter grid
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
}
# perform grid search and 10-fold cross-validation (k = 10)
# NOTE: GridSearchCV scores regressors with their default metric (R^2);
# 108 parameter combinations x 10 folds — n_jobs=-1 would parallelise the
# search if runtime matters.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# print best hyperparameters and corresponding mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9984540951428933
# fit random forest model with best hyperparameters from above
# (all tuned values come from param_grid, so they can be unpacked directly)
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# their KL divergence — applied to raw target/prediction vectors it is not a
# standard regression metric; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.002748350813295417 R2 Score: 0.9993440760119627 RMSE: 0.052425 Entropy Value: 0.00028602597187348704
# Rank the inputs by the fitted forest's impurity-based importances.
# NOTE: the 'feature' labels come from principal_df, whose columns are
# principal-component scores relabeled with the original predictor names,
# so the importances describe components rather than the raw indicators.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_rf_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.742827 |
| 0 | cardiovasc_death_rate | 0.187586 |
| 6 | median_age | 0.034101 |
| 2 | female_smokers | 0.019551 |
| 5 | aged_65_older | 0.014214 |
| 3 | male_smokers | 0.001588 |
| 4 | life_expectancy | 0.000133 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — presumably a CSV exported
# earlier in this notebook; verify it exists before rerunning.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe for a quick sanity check
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the pipeline
country1 = 'Canada'
country2 = 'Denmark'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared. .copy() makes the slice an
# independent frame so that the lagged-column assignments further down do not
# trigger pandas' SettingWithCopyWarning (writing into a view of the original).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 5187 | Denmark | 2/2/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5188 | Denmark | 2/3/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5189 | Denmark | 2/4/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5190 | Denmark | 2/5/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| 5191 | Denmark | 2/6/2020 | 2.5 | 0.940 | 0.2 | 136.520 | 5882259 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 15716 | Canada | 12/25/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092509 |
| 15717 | Canada | 12/26/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092338 |
| 15718 | Canada | 12/27/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092196 |
| 15719 | Canada | 12/28/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.092321 |
| 15720 | Canada | 12/29/2022 | 2.5 | 0.929 | 0.5 | 4.037 | 38454328 | 1.093162 |
2134 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged copy of the mortality rate per horizon (1 day, 7 days,
# 30 days). Lags are computed per country so values never cross a location
# boundary; rows with no available history are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis to decorrelate the predictors before the
# Random Forest fit (addresses multi-collinearity between the indicators).
# FIX(review): the original fitted PCA on df_updated.iloc[:, 2:], which at
# this point also contained 'Mortality Rate' and its three lagged copies —
# i.e. the prediction target leaked into the "principal components" used as
# model inputs, inflating every downstream score (R^2 ~ 0.999). PCA is now
# restricted to the actual predictor columns.
predictor_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[predictor_cols])
# One component per input variable for the country health index
# (with all 5 components kept, this is a pure rotation of the 5 predictors)
n_components = 5
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE: the columns below hold principal-component scores; they reuse the
# original predictor names only so the later selection steps keep working.
principal_df = pd.DataFrame(data=principal_components, columns=predictor_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the location label. The dummy columns stay in df_updated for
# reference only — the model matrix below is taken from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features using training-split statistics only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Grid-search the forest's hyperparameters with 10-fold cross-validation
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985212702477563
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids re-listing each hyperparameter by hand (the original duplicated
# every key, which invites drift if the grid changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it renormalizes both inputs); mortality rates
# are not distributions, so this value has no clear statistical meaning here.
# Kept for continuity with the original analysis — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004149548610657574 R2 Score: 0.9990096648287803 RMSE: 0.064417 Entropy Value: 0.0005668080870825645
# Rank the inputs by the fitted forest's impurity-based importances.
# NOTE: the 'feature' labels come from principal_df, whose columns are
# principal-component scores relabeled with the original predictor names,
# so the importances describe components rather than the raw indicators.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_rf_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937978 |
| 0 | hospital_beds_per_thousand | 0.035832 |
| 2 | extreme_poverty | 0.021979 |
| 3 | population_density | 0.003812 |
| 4 | population | 0.000400 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — presumably a CSV exported
# earlier in this notebook; verify it exists before rerunning.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe for a quick sanity check
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the pipeline
country1 = 'Finland'
country2 = 'France'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries being compared. .copy() makes the slice an
# independent frame so that the lagged-column assignments further down do not
# trigger pandas' SettingWithCopyWarning (writing into a view of the original).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 153.507 | 5.76 | 18.3 | 22.6 | 81.91 | 21.228 | 42.8 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411710 |
| 9443 | France | 12/26/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411282 |
| 9444 | France | 12/27/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411730 |
| 9445 | France | 12/28/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411813 |
| 9446 | France | 12/29/2022 | 86.060 | 4.77 | 30.1 | 35.6 | 82.66 | 19.718 | 42.0 | 0.411892 |
2137 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged copy of the mortality rate per horizon (1 day, 7 days,
# 30 days). Lags are computed per country so values never cross a location
# boundary; rows with no available history are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis to decorrelate the predictors before the
# Random Forest fit (addresses multi-collinearity between the indicators).
# FIX(review): the original fitted PCA on df_updated.iloc[:, 2:], which at
# this point also contained 'Mortality Rate' and its three lagged copies —
# i.e. the prediction target leaked into the "principal components" used as
# model inputs, inflating every downstream score. PCA is now restricted to
# the actual predictor columns.
predictor_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
pca = PCA()
pca.fit(df_updated[predictor_cols])
# One component per input variable for the population health index
# (with all 7 components kept, this is a pure rotation of the 7 predictors)
n_components = 7
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE: the columns below hold principal-component scores; they reuse the
# original predictor names only so the later selection steps keep working.
principal_df = pd.DataFrame(data=principal_components, columns=predictor_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the location label. The dummy columns stay in df_updated for
# reference only — the model matrix below is taken from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features using training-split statistics only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Grid-search the forest's hyperparameters with 10-fold cross-validation
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9901279701885508
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids re-listing each hyperparameter by hand (the original duplicated
# every key, which invites drift if the grid changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it renormalizes both inputs); mortality rates
# are not distributions, so this value has no clear statistical meaning here.
# Kept for continuity with the original analysis — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.03485209804982485 R2 Score: 0.9965523687852408 RMSE: 0.186687 Entropy Value: 0.0021807550391466923
# Rank the inputs by the fitted forest's impurity-based importances.
# NOTE: the 'feature' labels come from principal_df, whose columns are
# principal-component scores relabeled with the original predictor names,
# so the importances describe components rather than the raw indicators.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_rf_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.519750 |
| 0 | cardiovasc_death_rate | 0.429350 |
| 2 | female_smokers | 0.034202 |
| 5 | aged_65_older | 0.006474 |
| 3 | male_smokers | 0.004594 |
| 6 | median_age | 0.004512 |
| 4 | life_expectancy | 0.001119 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — presumably a CSV exported
# earlier in this notebook; verify it exists before rerunning.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe for a quick sanity check
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the pipeline
country1 = 'Finland'
country2 = 'France'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two countries being compared. .copy() makes the slice an
# independent frame so that the lagged-column assignments further down do not
# trigger pandas' SettingWithCopyWarning (writing into a view of the original).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 7310 | Finland | 1/29/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7311 | Finland | 1/30/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7312 | Finland | 1/31/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7313 | Finland | 2/1/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| 7314 | Finland | 2/2/2020 | 3.28 | 0.938 | 0.04 | 18.136 | 5540745 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9442 | France | 12/25/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411710 |
| 9443 | France | 12/26/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411282 |
| 9444 | France | 12/27/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411730 |
| 9445 | France | 12/28/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411813 |
| 9446 | France | 12/29/2022 | 5.98 | 0.901 | 0.02 | 122.578 | 67813000 | 0.411892 |
2137 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged copy of the mortality rate per horizon (1 day, 7 days,
# 30 days). Lags are computed per country so values never cross a location
# boundary; rows with no available history are filled with 0.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Principal Component Analysis to decorrelate the predictors before the
# Random Forest fit (addresses multi-collinearity between the indicators).
# FIX(review): the original fitted PCA on df_updated.iloc[:, 2:], which at
# this point also contained 'Mortality Rate' and its three lagged copies —
# i.e. the prediction target leaked into the "principal components" used as
# model inputs, inflating every downstream score. PCA is now restricted to
# the actual predictor columns.
predictor_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
pca = PCA()
pca.fit(df_updated[predictor_cols])
# One component per input variable for the country health index
# (with all 5 components kept, this is a pure rotation of the 5 predictors)
n_components = 5
principal_components = pca.transform(df_updated[predictor_cols])[:, :n_components]
# NOTE: the columns below hold principal-component scores; they reuse the
# original predictor names only so the later selection steps keep working.
principal_df = pd.DataFrame(data=principal_components, columns=predictor_cols)
principal_df['location'] = df_updated['location'].values
# One-hot encode the location label. The dummy columns stay in df_updated for
# reference only — the model matrix below is taken from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the rows as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize the features using training-split statistics only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Grid-search the forest's hyperparameters with 10-fold cross-validation
rf = RandomForestRegressor(n_estimators=100, random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean CV score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9910318902554458
# Refit a Random Forest on the full training split using the best
# hyperparameters found by the grid search above. Unpacking best_params_
# avoids re-listing each hyperparameter by hand (the original duplicated
# every key, which invites drift if the grid changes).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test split: MSE, RMSE, and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it renormalizes both inputs); mortality rates
# are not distributions, so this value has no clear statistical meaning here.
# Kept for continuity with the original analysis — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.035576227278599215 R2 Score: 0.9964807366404823 RMSE: 0.188617 Entropy Value: 0.0016675150996863651
# Rank the inputs by the fitted forest's impurity-based importances.
# NOTE: the 'feature' labels come from principal_df, whose columns are
# principal-component scores relabeled with the original predictor names,
# so the importances describe components rather than the raw indicators.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_rf_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.937639 |
| 2 | extreme_poverty | 0.039984 |
| 0 | hospital_beds_per_thousand | 0.012847 |
| 3 | population_density | 0.007729 |
| 4 | population | 0.001800 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): machine-specific absolute path — presumably a CSV exported
# earlier in this notebook; verify it exists before rerunning.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe for a quick sanity check
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this run of the pipeline
country1 = 'Iceland'
country2 = 'Italy'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two countries being compared. .copy() makes the slice an
# independent frame so that the lagged-column assignments further down do not
# trigger pandas' SettingWithCopyWarning (writing into a view of the original).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 117.992 | 5.31 | 14.3 | 15.2 | 82.99 | 14.431 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 113.151 | 4.78 | 19.8 | 27.8 | 83.51 | 23.021 | 47.9 | 0.735109 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by 1, 7, and 30 days;
# groupby('location') keeps the shift from bleeding across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; zero-fill those NaNs.
# NOTE(review): 0 matches the observed pre-outbreak mortality at the start of each
# series, so this imputation is plausible here — confirm it is the intended choice.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Fit PCA on every column from position 2 onward.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its three lag columns —
# the target leaks into the components used as predictors downstream. PCA is also
# fit on unscaled data here (StandardScaler is applied only after PCA below), so
# components are dominated by the largest-variance raw columns. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project onto the first 7 principal components (PCA was fit on 11 columns above,
# so this keeps the 7 highest-variance directions, not the 7 original features).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the
# mortality lags). The downstream "feature importance" table therefore ranks
# components, not the named raw features — confirm this is understood.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the resulting dummy columns are never added to X below; this step
# only reshapes df_updated, whose 'Mortality Rate' column is reused as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 PCA components; y: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest estimator; the grid below overrides its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the forest's hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9993936365790189
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids hand-copying each key (the original repeated all
# four) and stays in sync with whatever the grid actually tuned.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and computes the KL divergence of y_test
# from y_pred — it is not a standard regression error metric and is undefined
# for negative values. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.02379433705546307 R2 Score: 0.9980488034443954 RMSE: 0.154254 Entropy Value: 0.0014017849110386118
# Rank the (PCA-derived) predictors by the fitted forest's impurity importances
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 0 | cardiovasc_death_rate | 0.472670 |
| 1 | diabetes_prevalence | 0.340589 |
| 6 | median_age | 0.162234 |
| 2 | female_smokers | 0.021667 |
| 3 | male_smokers | 0.001448 |
| 5 | aged_65_older | 0.001235 |
| 4 | life_expectancy | 0.000157 |
# Reload the dataframe of first-pairing countries exported in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'Iceland'
country2 = 'Italy'
# Keep only the country-health-index features needed for the Random Forest
# analysis, then restrict the rows to the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 20911 | Iceland | 2/28/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20912 | Iceland | 2/29/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20913 | Iceland | 3/1/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20914 | Iceland | 3/2/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| 20915 | Iceland | 3/3/2020 | 2.91 | 0.949 | 0.2 | 3.404 | 372903 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23006 | Italy | 12/25/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23007 | Italy | 12/26/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23008 | Italy | 12/27/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23009 | Italy | 12/28/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
| 23010 | Italy | 12/29/2022 | 3.18 | 0.892 | 2.0 | 205.859 | 59037472 | 0.735109 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by 1, 7, and 30 days;
# groupby('location') keeps the shift from bleeding across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; zero-fill those NaNs.
# NOTE(review): 0 matches the observed pre-outbreak mortality at the start of each
# series, so this imputation is plausible here — confirm it is the intended choice.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Fit PCA on every column from position 2 onward.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its three lag columns —
# the target leaks into the components used as predictors downstream. PCA is also
# fit on unscaled data here (StandardScaler is applied only after PCA below), so
# components are dominated by the largest-variance raw columns (e.g. population).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
# Project onto the first 5 principal components (PCA was fit on 9 columns above,
# so this keeps the 5 highest-variance directions, not the 5 original features).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the
# mortality lags). The downstream "feature importance" table therefore ranks
# components, not the named raw features — confirm this is understood.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the resulting dummy columns are never added to X below; this step
# only reshapes df_updated, whose 'Mortality Rate' column is reused as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X: the 5 PCA components; y: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest estimator; the grid below overrides its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the forest's hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9993675631994823
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids hand-copying each key (the original repeated all
# four) and stays in sync with whatever the grid actually tuned.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and computes the KL divergence of y_test
# from y_pred — it is not a standard regression error metric and is undefined
# for negative values. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.021371276450830683 R2 Score: 0.9982475006173722 RMSE: 0.146189 Entropy Value: 0.001247239760700292
# Rank the (PCA-derived) predictors by the fitted forest's impurity importances
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.929630 |
| 0 | hospital_beds_per_thousand | 0.045362 |
| 2 | extreme_poverty | 0.023022 |
| 3 | population_density | 0.001775 |
| 4 | population | 0.000212 |
# Reload the dataframe of first-pairing countries exported in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'Netherlands'
country2 = 'Sweden'
# Keep only the population-health features needed for the Random Forest analysis,
# then restrict the rows to the two countries under comparison (one .loc does both).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 109.361 | 5.29 | 24.4 | 27.3 | 82.28 | 18.779 | 43.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 133.982 | 4.79 | 18.8 | 18.9 | 82.80 | 19.985 | 41.0 | 0.816005 |
2100 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by 1, 7, and 30 days;
# groupby('location') keeps the shift from bleeding across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; zero-fill those NaNs.
# NOTE(review): 0 matches the observed pre-outbreak mortality at the start of each
# series, so this imputation is plausible here — confirm it is the intended choice.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Fit PCA on every column from position 2 onward.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its three lag columns —
# the target leaks into the components used as predictors downstream. PCA is also
# fit on unscaled data here (StandardScaler is applied only after PCA below), so
# components are dominated by the largest-variance raw columns. Confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
n_components = 7 # of input variables for the Random Forest Model Analysis
# Project onto the first 7 principal components (PCA was fit on 11 columns above,
# so this keeps the 7 highest-variance directions, not the 7 original features).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# NOTE(review): these column labels reuse the raw feature names, but each column
# is a principal component (a linear mix of ALL PCA inputs, including the
# mortality lags). The downstream "feature importance" table therefore ranks
# components, not the named raw features — confirm this is understood.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location'.
# NOTE(review): the resulting dummy columns are never added to X below; this step
# only reshapes df_updated, whose 'Mortality Rate' column is reused as y.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the 7 PCA components; y: the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit the scaler on the training split only (avoids test-set leakage into scaling)
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardize both splits using the scaler fitted on the training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest estimator; the grid below overrides its hyperparameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Search space for the forest's hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search scored by 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(rf, param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning configuration and its mean cross-validation score
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9984656768745841
# Refit a Random Forest with the best hyperparameters found by the grid search.
# Unpacking best_params_ avoids hand-copying each key (the original repeated all
# four) and stays in sync with whatever the grid actually tuned.
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the performance of the Random Forest Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and computes the KL divergence of y_test
# from y_pred — it is not a standard regression error metric and is undefined
# for negative values. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.010402460269562753 R2 Score: 0.9990051222850926 RMSE: 0.101992 Entropy Value: 0.00040629309095891516
# Rank the (PCA-derived) predictors by the fitted forest's impurity importances
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.973529 |
| 2 | female_smokers | 0.023056 |
| 3 | male_smokers | 0.001375 |
| 0 | cardiovasc_death_rate | 0.000910 |
| 5 | aged_65_older | 0.000438 |
| 4 | life_expectancy | 0.000347 |
| 6 | median_age | 0.000345 |
# Reload the dataframe of first-pairing countries exported in the previous step
dataframe_one_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(dataframe_one_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# Countries compared in this pairing
country1 = 'Netherlands'
country2 = 'Sweden'
# Keep only the country-health-index features needed for the Random Forest
# analysis, then restrict the rows to the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 9447 | Netherlands | 2/27/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9448 | Netherlands | 2/28/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9449 | Netherlands | 2/29/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9450 | Netherlands | 3/1/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| 9451 | Netherlands | 3/2/2020 | 3.32 | 0.944 | 0.1 | 508.544 | 17564020 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 24069 | Sweden | 12/25/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24070 | Sweden | 12/26/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24071 | Sweden | 12/27/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24072 | Sweden | 12/28/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.811466 |
| 24073 | Sweden | 12/29/2022 | 2.22 | 0.945 | 0.5 | 24.718 | 10549349 | 0.816005 |
2100 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate within each country by 1, 7, and 30 days;
# groupby('location') keeps the shift from bleeding across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first 1/7/30 rows of each country have no history; zero-fill those NaNs.
# NOTE(review): 0 matches the observed pre-outbreak mortality at the start of each
# series, so this imputation is plausible here — confirm it is the intended choice.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# Fit PCA on every column from position 2 onward.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# includes 'Mortality Rate' (the prediction target) and its three lag columns —
# the target leaks into the components used as predictors downstream. PCA is also
# fit on unscaled data here (StandardScaler is applied only after PCA below), so
# components are dominated by the largest-variance raw columns (e.g. population).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country-health-index Random Forest analysis.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Assemble the components into a DataFrame, then attach the country labels.
# NOTE(review): these columns are principal components, not the original variables, even
# though they reuse the original feature names — the feature-importance table read later
# inherits this naming; confirm it is intentional.
component_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns created here are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this call's only visible effect is removing
# the 'location' column from df_updated; confirm the step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the first five principal components (labelled with the original feature
# names by the preceding PCA step).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split puts rows of the same country's time series in both train and
# test, and the PCA above was fit on all rows — the test scores below will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (the scaler was fit on the training data only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9985072666667036
# Refit a Random Forest on the training data using the best hyperparameters found by the
# grid search. Unpacking best_params_ directly avoids re-typing every hyperparameter by
# hand (and silently dropping any parameter added to the grid later).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes KL divergence — not a standard regression metric, and it is
# undefined (inf) wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.011437542353413214 R2 Score: 0.9989061283863766 RMSE: 0.106946 Entropy Value: 0.0004613073619798649
# Tabulate the tuned model's feature importances, largest first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.974072 |
| 2 | extreme_poverty | 0.023855 |
| 3 | population_density | 0.001627 |
| 4 | population | 0.000396 |
| 0 | hospital_beds_per_thousand | 0.000050 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on this machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries under comparison. .copy() materialises the slice so the
# lagged-column assignments performed on df_updated afterwards modify an independent frame
# instead of a view (avoids pandas' SettingWithCopyWarning / silent chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
# Display the filtered dataframe for inspection.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 424.688 | 5.81 | 30.1 | 44.4 | 75.05 | 20.801 | 44.7 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 122.137 | 4.28 | 20.0 | 24.7 | 81.32 | 18.517 | 40.8 | 0.883564 |
2090 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create the lagged mortality features (previous day / week / month) per country in one
# loop instead of three copy-pasted shift()/fillna() pairs. shift() introduces NaNs at the
# start of each country's series; as before, these are replaced with 0.
for lag_col, lag in {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and
# its lag columns, and PCA is fit on all rows before the train/test split — both leak target
# and test information into the features and inflate the scores reported below. Consider
# fitting PCA on the predictor columns of the training split only — TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population-health-index Random Forest analysis.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Assemble the components into a DataFrame, then attach the country labels.
# NOTE(review): these columns are principal components, not the original variables, even
# though they reuse the original feature names — the feature-importance table read later
# inherits this naming; confirm it is intentional.
component_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns created here are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this call's only visible effect is removing
# the 'location' column from df_updated; confirm the step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the first seven principal components (labelled with the original feature
# names by the preceding PCA step).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split puts rows of the same country's time series in both train and
# test, and the PCA above was fit on all rows — the test scores below will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (the scaler was fit on the training data only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Best CV score: 0.944267642996866
# Refit a Random Forest on the training data using the best hyperparameters found by the
# grid search. Unpacking best_params_ directly avoids re-typing every hyperparameter by
# hand (and silently dropping any parameter added to the grid later).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes KL divergence — not a standard regression metric, and it is
# undefined (inf) wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 1.43474289655929 R2 Score: 0.9257898367511572 RMSE: 1.197808 Entropy Value: 0.01142862992057888
# Tabulate the tuned model's feature importances, largest first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.867049 |
| 6 | median_age | 0.035735 |
| 2 | female_smokers | 0.027527 |
| 3 | male_smokers | 0.020658 |
| 4 | life_expectancy | 0.018178 |
| 0 | cardiovasc_death_rate | 0.016531 |
| 5 | aged_65_older | 0.014323 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on this machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'United Kingdom'
country2 = 'Bulgaria'
# Extracting important features for the Random Forest Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the two countries under comparison. .copy() materialises the slice so the
# lagged-column assignments performed on df_updated afterwards modify an independent frame
# instead of a view (avoids pandas' SettingWithCopyWarning / silent chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
# Display the filtered dataframe for inspection.
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 2099 | Bulgaria | 3/8/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2100 | Bulgaria | 3/9/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2101 | Bulgaria | 3/10/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 0.000000 |
| 2102 | Bulgaria | 3/11/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 14.285714 |
| 2103 | Bulgaria | 3/12/2020 | 7.454 | 0.816 | 1.5 | 65.180 | 6781955 | 14.285714 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13605 | United Kingdom | 12/25/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13606 | United Kingdom | 12/26/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13607 | United Kingdom | 12/27/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13608 | United Kingdom | 12/28/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
| 13609 | United Kingdom | 12/29/2022 | 2.540 | 0.932 | 0.2 | 272.898 | 67508936 | 0.883564 |
2090 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create the lagged mortality features (previous day / week / month) per country in one
# loop instead of three copy-pasted shift()/fillna() pairs. shift() introduces NaNs at the
# start of each country's series; as before, these are replaced with 0.
for lag_col, lag in {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and
# its lag columns, and PCA is fit on all rows before the train/test split — both leak target
# and test information into the features and inflate the scores reported below. Consider
# fitting PCA on the predictor columns of the training split only — TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 5 principal components — one per input variable of the
# country-health-index Random Forest analysis.
n_components = 5
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Assemble the components into a DataFrame, then attach the country labels.
# NOTE(review): these columns are principal components, not the original variables, even
# though they reuse the original feature names — the feature-importance table read later
# inherits this naming; confirm it is intentional.
component_labels = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns created here are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this call's only visible effect is removing
# the 'location' column from df_updated; confirm the step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the first five principal components (labelled with the original feature
# names by the preceding PCA step).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split puts rows of the same country's time series in both train and
# test, and the PCA above was fit on all rows — the test scores below will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (the scaler was fit on the training data only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score: 0.9339708697999024
# Refit a Random Forest on the training data using the best hyperparameters found by the
# grid search. Unpacking best_params_ directly avoids re-typing every hyperparameter by
# hand (and silently dropping any parameter added to the grid later).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes KL divergence — not a standard regression metric, and it is
# undefined (inf) wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.7342554973835562 R2 Score: 0.9620216134487464 RMSE: 0.856887 Entropy Value: 0.007601507191777842
# Tabulate the tuned model's feature importances, largest first.
importance_values = best_rf_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.881448 |
| 2 | extreme_poverty | 0.054324 |
| 4 | population | 0.037373 |
| 3 | population_density | 0.025757 |
| 0 | hospital_beds_per_thousand | 0.001098 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on this machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the imported dataframe for inspection.
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Cyprus'
country2 = 'Czechia'
# Extracting important features for Random Forest Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the two countries under comparison. .copy() materialises the slice so the
# lagged-column assignments performed on df_updated afterwards modify an independent frame
# instead of a view (avoids pandas' SettingWithCopyWarning / silent chained assignment).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
# Display the filtered dataframe for inspection.
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3126 | Cyprus | 3/8/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3127 | Cyprus | 3/9/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3128 | Cyprus | 3/10/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3129 | Cyprus | 3/11/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| 3130 | Cyprus | 3/12/2020 | 141.171 | 9.24 | 19.6 | 52.7 | 80.98 | 13.416 | 37.3 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5182 | Czechia | 12/25/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919258 |
| 5183 | Czechia | 12/26/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919368 |
| 5184 | Czechia | 12/27/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919431 |
| 5185 | Czechia | 12/28/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919430 |
| 5186 | Czechia | 12/29/2022 | 227.485 | 6.82 | 30.5 | 38.3 | 79.38 | 19.027 | 43.3 | 0.919575 |
2061 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the Random Forest Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create the lagged mortality features (previous day / week / month) per country in one
# loop instead of three copy-pasted shift()/fillna() pairs. shift() introduces NaNs at the
# start of each country's series; as before, these are replaced with 0.
for lag_col, lag in {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}.items():
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and
# its lag columns, and PCA is fit on all rows before the train/test split — both leak target
# and test information into the features and inflate the scores reported below. Consider
# fitting PCA on the predictor columns of the training split only — TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Keep the first 7 principal components — one per input variable of the
# population-health-index Random Forest analysis.
n_components = 7
principal_components = pca.transform(df_updated.iloc[:, 2:])[:, :n_components]
# Assemble the components into a DataFrame, then attach the country labels.
# NOTE(review): these columns are principal components, not the original variables, even
# though they reuse the original feature names — the feature-importance table read later
# inherits this naming; confirm it is intentional.
component_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns created here are never used below — X comes from
# principal_df and y from 'Mortality Rate' — so this call's only visible effect is removing
# the 'location' column from df_updated; confirm the step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Model inputs are the first seven principal components (labelled with the original feature
# names by the preceding PCA step).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split puts rows of the same country's time series in both train and
# test, and the PCA above was fit on all rows — the test scores below will be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Apply the fitted scaler to both splits (the scaler was fit on the training data only).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor that the grid search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter search space.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the winning hyperparameter combination and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9931354068766577
# Refit a Random Forest on the training data using the best hyperparameters found by the
# grid search. Unpacking best_params_ directly avoids re-typing every hyperparameter by
# hand (and silently dropping any parameter added to the grid later).
best_rf_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its arguments into probability
# distributions and computes KL divergence — not a standard regression metric, and it is
# undefined (inf) wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.00047399699097167984 R2 Score: 0.9991753265149302 RMSE: 0.021771 Entropy Value: 0.00015620474538757492
# Impurity-based feature importances, sorted descending.
# NOTE(review): X holds principal components that were merely labelled with
# the original column names, so each "feature" below is a PC, not the raw
# variable — the ranking does not map one-to-one onto the raw inputs.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded notebook output (descending importance):
#   diabetes_prevalence 0.658891, cardiovasc_death_rate 0.219210,
#   aged_65_older 0.052242, median_age 0.044419, female_smokers 0.022219,
#   male_smokers 0.001522, life_expectancy 0.001498
# Importing the dataframe that includes the first countries in each pairing
# of countries from the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Recorded notebook output: 27272 rows x 17 columns
# (date, cardiovasc_death_rate, ..., population, Mortality Rate, location)
# Country pair analysed in this section.
country1 = 'Cyprus'
country2 = 'Czechia'
# Keep only the columns used by the country-health-index Random Forest analysis.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the selected country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Recorded notebook output: 2061 rows x 8 columns
# Rationale: lagged copies of the mortality rate (previous day / week /
# month) turn the time series into a tabular supervised-learning problem,
# which is what a Random Forest — a non-sequential ensemble model — expects.
# Lagged mortality features, computed per country so values never leak
# across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the numeric predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself — target leakage that
# inflates every downstream score. The target is now excluded.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 5 principal components (= number of model input variables
# for the country health index).
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the PC columns are labelled with the original feature names
# for downstream compatibility, but each column is a principal component,
# not the named raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (NOTE(review): the dummy columns are never used
# below — X is built from principal_df and y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Country-health-index inputs (these names label principal components, not
# the raw variables — see the PCA step above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model (seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise features: fit the scaler on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits (no test-set leakage here).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seed fixed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded notebook output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
#   Best CV score: 0.9866151912407417
# Refit a Random Forest with the best hyperparameters found above (the grid
# contains only RandomForestRegressor keyword arguments, so splat the dict).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes a KL divergence between
# normalised probability distributions; using raw mortality values is
# statistically dubious — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded notebook output:
#   MSE: 0.008157893614715191  R2 Score: 0.9858066639953047
#   RMSE: 0.090321  Entropy Value: 0.002246714925831334
# Impurity-based feature importances, sorted descending.
# NOTE(review): each "feature" below is a principal component labelled with
# an original column name, not the raw variable itself.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded notebook output (descending importance):
#   human_development_index 0.818042, hospital_beds_per_thousand 0.113415,
#   extreme_poverty 0.051214, population_density 0.013891, population 0.003438
# Importing the dataframe that includes the first countries in each pairing
# of countries from the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Recorded notebook output: 27272 rows x 17 columns
# (date, cardiovasc_death_rate, ..., population, Mortality Rate, location)
# Country pair analysed in this section.
country1 = 'Estonia'
country2 = 'Latvia'
# Keep only the columns used by the population-health-index Random Forest analysis.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Restrict to the selected country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Recorded notebook output: 2099 rows x 10 columns
# Rationale: lagged copies of the mortality rate (previous day / week /
# month) turn the time series into a tabular supervised-learning problem,
# which is what a Random Forest — a non-sequential ensemble model — expects.
# Lagged mortality features, computed per country so values never leak
# across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the numeric predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself — target leakage that
# inflates every downstream score. The target is now excluded.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 7 principal components (= number of model input variables
# for the population health index).
n_components = 7
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the PC columns are labelled with the original feature names
# for downstream compatibility, but each column is a principal component,
# not the named raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (NOTE(review): the dummy columns are never used
# below — X is built from principal_df and y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Population-health-index inputs (these names label principal components,
# not the raw variables — see the PCA step above).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model (seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise features: fit the scaler on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits (no test-set leakage here).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seed fixed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded notebook output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
#   Best CV score: 0.9979817144366736
# Refit a Random Forest with the best hyperparameters found above (the grid
# contains only RandomForestRegressor keyword arguments, so splat the dict).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes a KL divergence between
# normalised probability distributions; using raw mortality values is
# statistically dubious — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded notebook output:
#   MSE: 0.0018696016902774828  R2 Score: 0.9968998375395624
#   RMSE: 0.043239  Entropy Value: 0.0007273351883650379
# Impurity-based feature importances, sorted descending.
# NOTE(review): each "feature" below is a principal component labelled with
# an original column name, not the raw variable itself.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded notebook output (descending importance):
#   diabetes_prevalence 0.954112, female_smokers 0.017664,
#   cardiovasc_death_rate 0.012118, aged_65_older 0.010583,
#   median_age 0.004349, male_smokers 0.000941, life_expectancy 0.000232
# Importing the dataframe that includes the first countries in each pairing
# of countries from the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Recorded notebook output: 27272 rows x 17 columns
# (date, cardiovasc_death_rate, ..., population, Mortality Rate, location)
# Country pair analysed in this section.
country1 = 'Estonia'
country2 = 'Latvia'
# Keep only the columns used by the country-health-index Random Forest analysis.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Restrict to the selected country pair.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated
# Recorded notebook output: 2099 rows x 8 columns
# Rationale: lagged copies of the mortality rate (previous day / week /
# month) turn the time series into a tabular supervised-learning problem,
# which is what a Random Forest — a non-sequential ensemble model — expects.
# Lagged mortality features, computed per country so values never leak
# across the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
# The first day/week/month of each country has no history; treat it as 0.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
# PCA to mitigate multi-collinearity among the numeric predictors.
# FIX(review): the original fit PCA on df_updated.iloc[:, 2:], which
# includes the target column 'Mortality Rate' itself — target leakage that
# inflates every downstream score. The target is now excluded.
pca_input = df_updated.drop(columns=['location', 'date', 'Mortality Rate'])
pca = PCA()
pca.fit(pca_input)
# Keep the first 5 principal components (= number of model input variables
# for the country health index).
n_components = 5
principal_components = pca.transform(pca_input)[:, :n_components]
# NOTE(review): the PC columns are labelled with the original feature names
# for downstream compatibility, but each column is a principal component,
# not the named raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# One-hot encode 'location' (NOTE(review): the dummy columns are never used
# below — X is built from principal_df and y from 'Mortality Rate').
df_updated = pd.get_dummies(df_updated, columns=['location'])
# Country-health-index inputs (these names label principal components, not
# the raw variables — see the PCA step above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# 70/30 train/test split for the Random Forest model (seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardise features: fit the scaler on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
# Apply the training-set scaling to both splits (no test-set leakage here).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base Random Forest regressor (seed fixed for reproducibility).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
# Hyperparameter grid for the exhaustive search.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# Exhaustive grid search with 10-fold cross-validation (k = 10).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)
# Report the best hyperparameters and the corresponding mean CV score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
# Recorded notebook output:
#   Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
#   Best CV score: 0.9975170379181927
# Refit a Random Forest with the best hyperparameters found above (the grid
# contains only RandomForestRegressor keyword arguments, so splat the dict).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate on the held-out test set: MSE, RMSE, R^2 and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes a KL divergence between
# normalised probability distributions; using raw mortality values is
# statistically dubious — confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
# Recorded notebook output:
#   MSE: 0.002618029320035362  R2 Score: 0.995658799272323
#   RMSE: 0.051167  Entropy Value: 0.0010107185902514522
# Impurity-based feature importances, sorted descending.
# NOTE(review): each "feature" below is a principal component labelled with
# an original column name, not the raw variable itself.
feature_importances = best_rf_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
# Recorded notebook output (descending importance):
#   human_development_index 0.960201, extreme_poverty 0.023049,
#   hospital_beds_per_thousand 0.014483, population_density 0.001800,
#   population 0.000466
# Importing the dataframe that includes the first countries in each pairing
# of countries from the previous step.
# NOTE(review): hard-coded absolute Windows path — consider a relative path
# or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
# Recorded notebook output: 27272 rows x 17 columns
# (date, cardiovasc_death_rate, ..., population, Mortality Rate, location)
country1 = 'Portugal'
country2 = 'Romania'
# Restrict to the population-health features (plus identifiers and the
# 'Mortality Rate' target) and to the two countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Single .loc does the row filter and column subset in one step; .copy()
# yields an independent frame so the lagged-mortality columns assigned in a
# later cell are plain writes, not writes into a filtered view (avoids
# pandas' SettingWithCopyWarning / chained-assignment pitfalls).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 127.842 | 9.85 | 16.3 | 30.0 | 82.05 | 21.502 | 46.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 370.946 | 9.74 | 22.9 | 37.1 | 76.05 | 17.850 | 43.0 | 2.036403 |
2072 rows × 10 columns
'''
Turn the time series into a supervised-learning table: lagged copies of the
mortality rate (previous day / week / month) become ordinary feature columns,
so each row is a self-contained observation that a non-sequential learner
such as a Random Forest can consume.
'''
# Group once (the grouping is identical for every lag) and shift within each
# country so a lag never bleeds across the country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lags.items():
    # Rows with no history yet are filled with 0.
    # NOTE(review): 0 fabricates a "zero mortality" history for the first
    # `periods` days of each country -- confirm this is acceptable.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which
# here includes 'Mortality Rate' and its three lagged copies -- the fitted
# components therefore encode the prediction target (leakage into X); confirm intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; StandardScaler is applied only after PCA below,
# whereas the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# Keeps the first 7 of the 11 fitted components (7 health features + target + 3 lags).
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# all PCA inputs), not the original variables; reusing the raw feature names
# makes the later importance ranking read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot frame is only used to pull y below; the dummy
# location columns never enter X.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows lets future observations
# train the model that is evaluated on earlier ones; a chronological split
# would give a sounder error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator whose hyper-parameters the search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate values for the exhaustive hyper-parameter search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search scored by 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9962992681061019
# Refit on the full training split using the tuned configuration.
# **best_params_ passes all four tuned hyper-parameters at once instead of
# re-typing each one by hand (same settings, less room for transcription bugs).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into probability
# distributions and computes a KL divergence, which is not a regression error
# metric for raw target/prediction vectors -- confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.005699053732913187 R2 Score: 0.9966620033088617 RMSE: 0.075492 Entropy Value: 0.00040310281139748264
# Rank the model inputs by learned importance, highest first. Bear in mind
# that the "features" here are principal components that were relabelled with
# the original column names upstream, not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.590038 |
| 0 | cardiovasc_death_rate | 0.335347 |
| 5 | aged_65_older | 0.038765 |
| 2 | female_smokers | 0.020746 |
| 6 | median_age | 0.008911 |
| 3 | male_smokers | 0.005290 |
| 4 | life_expectancy | 0.000903 |
# Re-load the dataframe of first-pairing countries for the country-health run.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Portugal'
country2 = 'Romania'
# Restrict to the country-health features (plus identifiers and the
# 'Mortality Rate' target) and to the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
# Single .loc does the row filter and column subset in one step; .copy()
# yields an independent frame so the lagged-mortality columns assigned in a
# later cell are plain writes, not writes into a filtered view (avoids
# pandas' SettingWithCopyWarning / chained-assignment pitfalls).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 10484 | Portugal | 3/1/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10485 | Portugal | 3/2/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10486 | Portugal | 3/3/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10487 | Portugal | 3/4/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| 10488 | Portugal | 3/5/2020 | 3.390 | 0.864 | 0.5 | 112.371 | 10270857 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18833 | Romania | 12/25/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.037520 |
| 18834 | Romania | 12/26/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18835 | Romania | 12/27/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18836 | Romania | 12/28/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
| 18837 | Romania | 12/29/2022 | 6.892 | 0.828 | 5.7 | 85.129 | 19659270 | 2.036403 |
2072 rows × 8 columns
'''
Convert the time series to a supervised-learning table: lagged copies of the
mortality rate (previous day / week / month) become ordinary feature columns,
so each row stands alone for a non-sequential learner like a Random Forest.
'''
# One groupby serves all three lags (hoisted out of the loop); shifting within
# each country keeps a lag from bleeding across the country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lags.items():
    # Rows with no history yet are filled with 0.
    # NOTE(review): 0 fabricates a "zero mortality" history for the first
    # `periods` days of each country -- confirm this is acceptable.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which
# here includes 'Mortality Rate' and its three lagged copies -- the fitted
# components therefore encode the prediction target (leakage into X); confirm intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# (e.g. population) dominate the components; StandardScaler is applied only
# after PCA below, whereas the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
# Keeps the first 5 of the 9 fitted components (5 country features + target + 3 lags).
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# all PCA inputs), not the original variables; reusing the raw feature names
# makes the later importance ranking read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot frame is only used to pull y below; the dummy
# location columns never enter X.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows lets future observations
# train the model that is evaluated on earlier ones; a chronological split
# would give a sounder error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator whose hyper-parameters the search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate values for the exhaustive hyper-parameter search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search scored by 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best CV score: 0.9971895025318556
# Refit on the full training split using the tuned configuration.
# **best_params_ passes all four tuned hyper-parameters at once instead of
# re-typing each one by hand (same settings, less room for transcription bugs).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into probability
# distributions and computes a KL divergence, which is not a regression error
# metric for raw target/prediction vectors -- confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.009693221423902118 R2 Score: 0.9943225765967789 RMSE: 0.098454 Entropy Value: 0.0009866503695753954
# Rank the model inputs by learned importance, highest first. Bear in mind
# that the "features" here are principal components that were relabelled with
# the original column names upstream, not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.881243 |
| 0 | hospital_beds_per_thousand | 0.078831 |
| 2 | extreme_poverty | 0.029944 |
| 3 | population_density | 0.009145 |
| 4 | population | 0.000837 |
# Re-load the dataframe of first-pairing countries for the next country pair.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the population-health features (plus identifiers and the
# 'Mortality Rate' target) and to the two countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
# Single .loc does the row filter and column subset in one step; .copy()
# yields an independent frame so the lagged-mortality columns assigned in a
# later cell are plain writes, not writes into a filtered view (avoids
# pandas' SettingWithCopyWarning / chained-assignment pitfalls).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols].copy()
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 287.959 | 7.29 | 23.1 | 37.7 | 77.54 | 15.070 | 41.2 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 439.415 | 10.08 | 37.7 | 40.2 | 76.00 | 17.366 | 41.2 | 0.716205 |
2067 rows × 10 columns
'''
Convert the time series to a supervised-learning table: lagged copies of the
mortality rate (previous day / week / month) become ordinary feature columns,
so each row stands alone for a non-sequential learner like a Random Forest.
'''
# One groupby serves all three lags (hoisted out of the loop); shifting within
# each country keeps a lag from bleeding across the country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lags.items():
    # Rows with no history yet are filled with 0.
    # NOTE(review): 0 fabricates a "zero mortality" history for the first
    # `periods` days of each country -- confirm this is acceptable.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] covers every column after 'location'/'date', which
# here includes 'Mortality Rate' and its three lagged copies -- the fitted
# components therefore encode the prediction target (leakage into X); confirm intended.
# NOTE(review): PCA is fitted on unscaled data, so large-magnitude columns
# dominate the components; StandardScaler is applied only after PCA below,
# whereas the conventional order is scale first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 7 as this equals the number of input variables for the Random Forest Model Analysis for the population health index
# Keeps the first 7 of the 11 fitted components (7 health features + target + 3 lags).
n_components = 7 # of input variables for the Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# all PCA inputs), not the original variables; reusing the raw feature names
# makes the later importance ranking read as if it ranked the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot frame is only used to pull y below; the dummy
# location columns never enter X.
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Split the dataset into training set and testing set for Random Forest Model
# NOTE(review): a random split of time-series rows lets future observations
# train the model that is evaluated on earlier ones; a chronological split
# would give a sounder error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
# Standardise both splits with the statistics learned from the training split.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator whose hyper-parameters the search will tune.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Candidate values for the exhaustive hyper-parameter search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search scored by 10-fold cross-validation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validation score.
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9930228916370709
# Refit on the full training split using the tuned configuration.
# **best_params_ passes all four tuned hyper-parameters at once instead of
# re-typing each one by hand (same settings, less room for transcription bugs).
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)
# Evaluate the tuned model on the held-out test split: MSE, RMSE, R^2, entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into probability
# distributions and computes a KL divergence, which is not a regression error
# metric for raw target/prediction vectors -- confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0009018432472984047 R2 Score: 0.996309224905843 RMSE: 0.030031 Entropy Value: 0.0004574118660922552
# Rank the model inputs by learned importance, highest first. Bear in mind
# that the "features" here are principal components that were relabelled with
# the original column names upstream, not the raw variables themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.872557 |
| 6 | median_age | 0.095584 |
| 2 | female_smokers | 0.017262 |
| 5 | aged_65_older | 0.008445 |
| 3 | male_smokers | 0.003182 |
| 0 | cardiovasc_death_rate | 0.001540 |
| 4 | life_expectancy | 0.001430 |
# Re-load the dataframe of first-pairing countries for the country-health run.
csv_path = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(csv_path)
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
country1 = 'Serbia'
country2 = 'Slovakia'
# Restrict to the country-health features (plus identifiers and the
# 'Mortality Rate' target) and to the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
# Single .loc does the row filter and column subset in one step; .copy()
# yields an independent frame so the lagged-mortality columns assigned in a
# later cell are plain writes, not writes into a filtered view (avoids
# pandas' SettingWithCopyWarning / chained-assignment pitfalls).
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols].copy()
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 11518 | Slovakia | 3/6/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11519 | Slovakia | 3/7/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11520 | Slovakia | 3/8/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11521 | Slovakia | 3/9/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| 11522 | Slovakia | 3/10/2020 | 5.820 | 0.860 | 0.70 | 113.128 | 5643455 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16754 | Serbia | 12/25/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.717058 |
| 16755 | Serbia | 12/26/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716963 |
| 16756 | Serbia | 12/27/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716677 |
| 16757 | Serbia | 12/28/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716395 |
| 16758 | Serbia | 12/29/2022 | 5.609 | 0.806 | 0.05 | 80.291 | 6871547 | 0.716205 |
2067 rows × 8 columns
'''
Convert the time series to a supervised-learning table: lagged copies of the
mortality rate (previous day / week / month) become ordinary feature columns,
so each row stands alone for a non-sequential learner like a Random Forest.
'''
# One groupby serves all three lags (hoisted out of the loop); shifting within
# each country keeps a lag from bleeding across the country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
lags = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
for lag_col, periods in lags.items():
    # Rows with no history yet are filled with 0.
    # NOTE(review): 0 fabricates a "zero mortality" history for the first
    # `periods` days of each country -- confirm this is acceptable.
    df_updated[lag_col] = mortality_by_country.shift(periods).fillna(0)
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Setting the number of principal components to 5 as this equals the number of input variables for the Random Forest Model Analysis for the country health index
n_components = 5 # of input variables for Random Forest Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardise the features: scaling statistics are learned from the training
# split only, then applied unchanged to the test split (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search. No n_estimators here: every
# value in the grid below overrides it, so setting it was redundant.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; results are
# identical, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyper-parameters and their mean cross-validation R^2 score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9925952418557594
# Rebuild the winning model from the search; unpacking best_params_ replaces
# the previous copy-paste of each hyper-parameter by name.
# NOTE(review): grid_search.best_estimator_ is an equivalent, already-refit
# model and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Held-out evaluation: Mean Squared Error (MSE), Root MSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# HACK: scipy.stats.entropy(pk, qk) computes the KL divergence between two
# *probability distributions* (it renormalises its inputs); applying it to raw
# mortality values is statistically dubious, and a zero in y_pred where y_test
# is non-zero yields inf. Kept unchanged for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.0008275646091810585 R2 Score: 0.9966132087172344 RMSE: 0.028767 Entropy Value: 0.000479760324669255
# Rank the model inputs by the Random Forest's impurity-based feature
# importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.911824 |
| 0 | hospital_beds_per_thousand | 0.045153 |
| 2 | extreme_poverty | 0.034897 |
| 3 | population_density | 0.006234 |
| 4 | population | 0.001892 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — runs only on this machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this Random Forest analysis.
country1 = 'Slovenia'
country2 = 'Spain'

# Keep the population-health features (plus identifiers and the target), and
# restrict the rows to the two selected countries in a single .loc step.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
               'female_smokers', 'male_smokers', 'life_expectancy',
               'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            health_cols]
df_updated
| location | date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | median_age | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 99.403 | 7.17 | 27.4 | 31.4 | 83.56 | 19.436 | 45.5 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 153.493 | 7.25 | 20.1 | 25.0 | 81.32 | 19.062 | 44.5 | 0.536669 |
2125 rows × 10 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
This is why the timeseries dataset must be converted into a supervised learning problem before the Random Forest Model can be applied to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day / week / month) per country so
# the time-series can be modelled as a supervised-learning problem.
# Lags are computed within each 'location' group so values never cross countries.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )  # rows with no history for this lag default to 0

# Principal Component Analysis (PCA) to address multi-collinearity.
# BUG FIX: previously PCA was fit on df_updated.iloc[:, 2:], which contained the
# target column 'Mortality Rate' itself — target leakage that inflated the
# downstream model's apparent accuracy. Fit on predictor columns only.
# NOTE(review): PCA is still fit on all rows before the train/test split, which
# leaks test-set statistics into the components — consider fitting on the
# training split only.
feature_df = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_df)

# Keep the first 7 principal components (the number of inputs used by the
# Random Forest Model Analysis for the population health index).
n_components = 7
principal_components = pca.transform(feature_df)[:, :n_components]

# Principal components are linear combinations of ALL input features, so label
# them PC1..PC7 instead of (misleadingly) reusing original feature names.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values

# One-hot encode the categorical 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardise the features: scaling statistics are learned from the training
# split only, then applied unchanged to the test split (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search. No n_estimators here: every
# value in the grid below overrides it, so setting it was redundant.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; results are
# identical, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyper-parameters and their mean cross-validation R^2 score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.9985269601145046
# Rebuild the winning model from the search; unpacking best_params_ replaces
# the previous copy-paste of each hyper-parameter by name.
# NOTE(review): grid_search.best_estimator_ is an equivalent, already-refit
# model and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Held-out evaluation: Mean Squared Error (MSE), Root MSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# HACK: scipy.stats.entropy(pk, qk) computes the KL divergence between two
# *probability distributions* (it renormalises its inputs); applying it to raw
# mortality values is statistically dubious, and a zero in y_pred where y_test
# is non-zero yields inf. Kept unchanged for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.004771901735090741 R2 Score: 0.9992681453869718 RMSE: 0.069079 Entropy Value: 0.00029331989492224016
# Rank the model inputs by the Random Forest's impurity-based feature
# importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | diabetes_prevalence | 0.895143 |
| 0 | cardiovasc_death_rate | 0.074364 |
| 2 | female_smokers | 0.019122 |
| 6 | median_age | 0.004778 |
| 3 | male_smokers | 0.003564 |
| 5 | aged_65_older | 0.002714 |
| 4 | life_expectancy | 0.000316 |
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — runs only on this machine;
# consider a relative path or a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the freshly loaded dataframe (notebook cell output).
df_updated
| date | cardiovasc_death_rate | diabetes_prevalence | female_smokers | male_smokers | life_expectancy | aged_65_older | aged_70_older | median_age | hospital_beds_per_thousand | human_development_index | extreme_poverty | gdp_per_capita | population_density | population | Mortality Rate | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2/25/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 1 | 2/26/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 2 | 2/27/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 3 | 2/28/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| 4 | 2/29/2020 | 145.183 | 6.35 | 28.4 | 30.9 | 81.54 | 19.202 | 13.748 | 44.4 | 7.37 | 0.922 | 0.7 | 45436.686 | 106.749 | 8939617 | 0.000000 | Austria |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27267 | 12/25/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086136 | United States |
| 27268 | 12/26/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.086032 | United States |
| 27269 | 12/27/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.085212 | United States |
| 27270 | 12/28/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084986 | United States |
| 27271 | 12/29/2022 | 151.089 | 10.79 | 19.1 | 24.6 | 78.86 | 15.413 | 9.732 | 38.3 | 2.77 | 0.926 | 1.2 | 54225.446 | 35.608 | 338289856 | 1.084791 | United States |
27272 rows × 17 columns
# The pair of countries compared in this Random Forest analysis.
country1 = 'Slovenia'
country2 = 'Spain'

# Keep the country-health-index features (plus identifiers and the target), and
# restrict the rows to the two selected countries in a single .loc step.
country_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_cols]
df_updated
| location | date | hospital_beds_per_thousand | human_development_index | extreme_poverty | population_density | population | Mortality Rate | |
|---|---|---|---|---|---|---|---|---|
| 24074 | Spain | 2/1/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24075 | Spain | 2/2/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24076 | Spain | 2/3/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24077 | Spain | 2/4/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| 24078 | Spain | 2/5/2020 | 2.97 | 0.904 | 1.0 | 93.105 | 47558632 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26194 | Slovenia | 12/25/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537665 |
| 26195 | Slovenia | 12/26/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537924 |
| 26196 | Slovenia | 12/27/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537894 |
| 26197 | Slovenia | 12/28/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.537128 |
| 26198 | Slovenia | 12/29/2022 | 4.50 | 0.917 | 0.0 | 102.619 | 2119843 | 0.536669 |
2125 rows × 8 columns
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate.
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the Random Forest Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature. A Random Forest model is a type of ensemble learning algorithm that is typically used for non-sequential data.
This is why the timeseries dataset must be converted into a supervised learning problem before the Random Forest Model can be applied to assess which variables are the strongest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality features (previous day / week / month) per country so
# the time-series can be modelled as a supervised-learning problem.
# Lags are computed within each 'location' group so values never cross countries.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = (
        df_updated.groupby('location')['Mortality Rate'].shift(lag).fillna(0)
    )  # rows with no history for this lag default to 0

# Principal Component Analysis (PCA) to address multi-collinearity.
# BUG FIX: previously PCA was fit on df_updated.iloc[:, 2:], which contained the
# target column 'Mortality Rate' itself — target leakage that inflated the
# downstream model's apparent accuracy. Fit on predictor columns only.
# NOTE(review): PCA is still fit on all rows before the train/test split, which
# leaks test-set statistics into the components — consider fitting on the
# training split only.
feature_df = df_updated.iloc[:, 2:].drop(columns=['Mortality Rate'])
pca = PCA()
pca.fit(feature_df)

# Keep the first 5 principal components (the number of inputs used by the
# Random Forest Model Analysis for the country health index).
n_components = 5
principal_components = pca.transform(feature_df)[:, :n_components]

# Principal components are linear combinations of ALL input features, so label
# them PC1..PC5 instead of (misleadingly) reusing original feature names.
selected_cols = [f'PC{i + 1}' for i in range(n_components)]
principal_df = pd.DataFrame(data=principal_components, columns=selected_cols)
principal_df['location'] = df_updated['location'].values

# One-hot encode the categorical 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values
# Hold out 30% of the observations for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Standardise the features: scaling statistics are learned from the training
# split only, then applied unchanged to the test split (no test-set leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Base estimator for the hyper-parameter search. No n_estimators here: every
# value in the grid below overrides it, so setting it was redundant.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid: 3*3*3*3 = 81 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 10-fold cross-validation (k = 10).
# n_jobs=-1 parallelises the 810 fits across all CPU cores; results are
# identical, only wall-clock time improves.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyper-parameters and their mean cross-validation R^2 score
# (R^2 is GridSearchCV's default scoring for regressors).
print('Best hyperparameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 0.9982082361713068
# Rebuild the winning model from the search; unpacking best_params_ replaces
# the previous copy-paste of each hyper-parameter by name.
# NOTE(review): grid_search.best_estimator_ is an equivalent, already-refit
# model and could be used directly.
best_rf_model = RandomForestRegressor(**grid_search.best_params_, random_state=42)
best_rf_model.fit(X_train_scaled, y_train)
y_pred = best_rf_model.predict(X_test_scaled)

# Held-out evaluation: Mean Squared Error (MSE), Root MSE, R^2, and entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# HACK: scipy.stats.entropy(pk, qk) computes the KL divergence between two
# *probability distributions* (it renormalises its inputs); applying it to raw
# mortality values is statistically dubious, and a zero in y_pred where y_test
# is non-zero yields inf. Kept unchanged for output compatibility.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE: 0.006855328683786699 R2 Score: 0.9989486154159127 RMSE: 0.082797 Entropy Value: 0.00046734218153940193
# Rank the model inputs by the Random Forest's impurity-based feature
# importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_rf_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
| feature | importance | |
|---|---|---|
| 1 | human_development_index | 0.961494 |
| 2 | extreme_poverty | 0.024439 |
| 0 | hospital_beds_per_thousand | 0.009664 |
| 3 | population_density | 0.004009 |
| 4 | population | 0.000394 |